Reinstate the shortcut-only attribute

Also add the blacklist attribute

Bug: 7005742
Bug: 2704000
Change-Id: Icbe60bdf25bfb098d9e3f20870be30d6aef07c9d
main
Jean Chalard 2012-08-31 15:24:39 +09:00
parent 49d8af8a4e
commit 72b1c93941
11 changed files with 125 additions and 42 deletions

View File

@ -172,12 +172,12 @@ abstract public class ExpandableBinaryDictionary extends Dictionary {
// considering performance regression.
protected void addWord(final String word, final String shortcutTarget, final int frequency) {
if (shortcutTarget == null) {
mFusionDictionary.add(word, frequency, null);
mFusionDictionary.add(word, frequency, null, false /* isNotAWord */);
} else {
// TODO: Do this in the subclass, with this class taking an arraylist.
final ArrayList<WeightedString> shortcutTargets = CollectionUtils.newArrayList();
shortcutTargets.add(new WeightedString(shortcutTarget, frequency));
mFusionDictionary.add(word, frequency, shortcutTargets);
mFusionDictionary.add(word, frequency, shortcutTargets, false /* isNotAWord */);
}
}

View File

@ -55,6 +55,8 @@ public class BinaryDictInputOutput {
* s | has a terminal ? 1 bit, 1 = yes, 0 = no : FLAG_IS_TERMINAL
* | has shortcut targets ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_SHORTCUT_TARGETS
* | has bigrams ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_BIGRAMS
* | is not a word ? 1 bit, 1 = yes, 0 = no : FLAG_IS_NOT_A_WORD
* | is blacklisted ? 1 bit, 1 = yes, 0 = no : FLAG_IS_BLACKLISTED
*
* c | IF FLAG_HAS_MULTIPLE_CHARS
* h | char, char, char, char n * (1 or 3 bytes) : use CharGroupInfo for i/o helpers
@ -153,6 +155,8 @@ public class BinaryDictInputOutput {
private static final int FLAG_IS_TERMINAL = 0x10;
private static final int FLAG_HAS_SHORTCUT_TARGETS = 0x08;
private static final int FLAG_HAS_BIGRAMS = 0x04;
private static final int FLAG_IS_NOT_A_WORD = 0x02;
private static final int FLAG_IS_BLACKLISTED = 0x01;
private static final int FLAG_ATTRIBUTE_HAS_NEXT = 0x80;
private static final int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40;
@ -778,6 +782,12 @@ public class BinaryDictInputOutput {
}
flags |= FLAG_HAS_BIGRAMS;
}
if (group.mIsNotAWord) {
flags |= FLAG_IS_NOT_A_WORD;
}
if (group.mIsBlacklistEntry) {
flags |= FLAG_IS_BLACKLISTED;
}
return flags;
}
@ -1352,12 +1362,14 @@ public class BinaryDictInputOutput {
buffer.position(currentPosition);
}
nodeContents.add(
new CharGroup(info.mCharacters, shortcutTargets,
bigrams, info.mFrequency, children));
new CharGroup(info.mCharacters, shortcutTargets, bigrams, info.mFrequency,
0 != (info.mFlags & FLAG_IS_NOT_A_WORD),
0 != (info.mFlags & FLAG_IS_BLACKLISTED), children));
} else {
nodeContents.add(
new CharGroup(info.mCharacters, shortcutTargets,
bigrams, info.mFrequency));
new CharGroup(info.mCharacters, shortcutTargets, bigrams, info.mFrequency,
0 != (info.mFlags & FLAG_IS_NOT_A_WORD),
0 != (info.mFlags & FLAG_IS_BLACKLISTED)));
}
groupOffset = info.mEndAddress;
}
@ -1478,7 +1490,11 @@ public class BinaryDictInputOutput {
0 != (optionsFlags & FRENCH_LIGATURE_PROCESSING_FLAG)));
if (null != dict) {
for (final Word w : dict) {
newDict.add(w.mWord, w.mFrequency, w.mShortcutTargets);
if (w.mIsBlacklistEntry) {
newDict.addBlacklistEntry(w.mWord, w.mShortcutTargets, w.mIsNotAWord);
} else {
newDict.add(w.mWord, w.mFrequency, w.mShortcutTargets, w.mIsNotAWord);
}
}
for (final Word w : dict) {
// By construction a binary dictionary may not have bigrams pointing to

View File

@ -101,26 +101,34 @@ public class FusionDictionary implements Iterable<Word> {
ArrayList<WeightedString> mBigrams;
int mFrequency; // NOT_A_TERMINAL == mFrequency indicates this is not a terminal.
Node mChildren;
boolean mIsNotAWord; // Only a shortcut
boolean mIsBlacklistEntry;
// The two following members to help with binary generation
int mCachedSize;
int mCachedAddress;
public CharGroup(final int[] chars, final ArrayList<WeightedString> shortcutTargets,
final ArrayList<WeightedString> bigrams, final int frequency) {
final ArrayList<WeightedString> bigrams, final int frequency,
final boolean isNotAWord, final boolean isBlacklistEntry) {
mChars = chars;
mFrequency = frequency;
mShortcutTargets = shortcutTargets;
mBigrams = bigrams;
mChildren = null;
mIsNotAWord = isNotAWord;
mIsBlacklistEntry = isBlacklistEntry;
}
public CharGroup(final int[] chars, final ArrayList<WeightedString> shortcutTargets,
final ArrayList<WeightedString> bigrams, final int frequency, final Node children) {
final ArrayList<WeightedString> bigrams, final int frequency,
final boolean isNotAWord, final boolean isBlacklistEntry, final Node children) {
mChars = chars;
mFrequency = frequency;
mShortcutTargets = shortcutTargets;
mBigrams = bigrams;
mChildren = children;
mIsNotAWord = isNotAWord;
mIsBlacklistEntry = isBlacklistEntry;
}
public void addChild(CharGroup n) {
@ -197,8 +205,9 @@ public class FusionDictionary implements Iterable<Word> {
* the existing ones if any. Note: unigram, bigram, and shortcut frequencies are only
* updated if they are higher than the existing ones.
*/
public void update(int frequency, ArrayList<WeightedString> shortcutTargets,
ArrayList<WeightedString> bigrams) {
public void update(final int frequency, final ArrayList<WeightedString> shortcutTargets,
final ArrayList<WeightedString> bigrams,
final boolean isNotAWord, final boolean isBlacklistEntry) {
if (frequency > mFrequency) {
mFrequency = frequency;
}
@ -234,6 +243,8 @@ public class FusionDictionary implements Iterable<Word> {
}
}
}
mIsNotAWord = isNotAWord;
mIsBlacklistEntry = isBlacklistEntry;
}
}
@ -296,10 +307,24 @@ public class FusionDictionary implements Iterable<Word> {
* @param word the word to add.
* @param frequency the frequency of the word, in the range [0..255].
* @param shortcutTargets a list of shortcut targets for this word, or null.
* @param isNotAWord true if this should not be considered a word (e.g. shortcut only)
*/
public void add(final String word, final int frequency,
final ArrayList<WeightedString> shortcutTargets) {
add(getCodePoints(word), frequency, shortcutTargets);
final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord) {
add(getCodePoints(word), frequency, shortcutTargets, isNotAWord,
false /* isBlacklistEntry */);
}
/**
* Helper method to add a blacklist entry as a string.
*
* The word is converted to code points with getCodePoints() and handed to the code-point
* based add() with a frequency of 0 and the blacklist flag set to true.
*
* @param word the word to add as a blacklist entry.
* @param shortcutTargets a list of shortcut targets for this word, or null.
* @param isNotAWord true if this is not a word for spellchecking purposes (shortcut only or so)
*/
public void addBlacklistEntry(final String word,
final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord) {
add(getCodePoints(word), 0, shortcutTargets, isNotAWord, true /* isBlacklistEntry */);
}
/**
@ -332,7 +357,8 @@ public class FusionDictionary implements Iterable<Word> {
if (charGroup != null) {
final CharGroup charGroup2 = findWordInTree(mRoot, word2);
if (charGroup2 == null) {
add(getCodePoints(word2), 0, null);
add(getCodePoints(word2), 0, null, false /* isNotAWord */,
false /* isBlacklistEntry */);
}
charGroup.addBigram(word2, frequency);
} else {
@ -349,9 +375,12 @@ public class FusionDictionary implements Iterable<Word> {
* @param word the word, as an int array.
* @param frequency the frequency of the word, in the range [0..255].
* @param shortcutTargets an optional list of shortcut targets for this word (null if none).
* @param isNotAWord true if this is not a word for spellchecking purposes (shortcut only or so)
* @param isBlacklistEntry true if this is a blacklisted word, false otherwise
*/
private void add(final int[] word, final int frequency,
final ArrayList<WeightedString> shortcutTargets) {
final ArrayList<WeightedString> shortcutTargets,
final boolean isNotAWord, final boolean isBlacklistEntry) {
assert(frequency >= 0 && frequency <= 255);
Node currentNode = mRoot;
int charIndex = 0;
@ -376,7 +405,7 @@ public class FusionDictionary implements Iterable<Word> {
final int insertionIndex = findInsertionIndex(currentNode, word[charIndex]);
final CharGroup newGroup = new CharGroup(
Arrays.copyOfRange(word, charIndex, word.length),
shortcutTargets, null /* bigrams */, frequency);
shortcutTargets, null /* bigrams */, frequency, isNotAWord, isBlacklistEntry);
currentNode.mData.add(insertionIndex, newGroup);
if (DBG) checkStack(currentNode);
} else {
@ -386,13 +415,15 @@ public class FusionDictionary implements Iterable<Word> {
// The new word is a prefix of an existing word, but the node on which it
// should end already exists as is. Since the old CharGroup was not a terminal,
// make it one by filling in its frequency and other attributes
currentGroup.update(frequency, shortcutTargets, null);
currentGroup.update(frequency, shortcutTargets, null, isNotAWord,
isBlacklistEntry);
} else {
// The new word matches the full old word and extends past it.
// We only have to create a new node and add it to the end of this.
final CharGroup newNode = new CharGroup(
Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length),
shortcutTargets, null /* bigrams */, frequency);
shortcutTargets, null /* bigrams */, frequency, isNotAWord,
isBlacklistEntry);
currentGroup.mChildren = new Node();
currentGroup.mChildren.mData.add(newNode);
}
@ -400,7 +431,9 @@ public class FusionDictionary implements Iterable<Word> {
if (0 == differentCharIndex) {
// Exact same word. Update the frequency if higher. This will also add the
// new shortcuts to the existing shortcut list if it already exists.
currentGroup.update(frequency, shortcutTargets, null);
currentGroup.update(frequency, shortcutTargets, null,
currentGroup.mIsNotAWord && isNotAWord,
currentGroup.mIsBlacklistEntry || isBlacklistEntry);
} else {
// Partial prefix match only. We have to replace the current node with a node
// containing the current prefix and create two new ones for the tails.
@ -408,21 +441,26 @@ public class FusionDictionary implements Iterable<Word> {
final CharGroup newOldWord = new CharGroup(
Arrays.copyOfRange(currentGroup.mChars, differentCharIndex,
currentGroup.mChars.length), currentGroup.mShortcutTargets,
currentGroup.mBigrams, currentGroup.mFrequency, currentGroup.mChildren);
currentGroup.mBigrams, currentGroup.mFrequency,
currentGroup.mIsNotAWord, currentGroup.mIsBlacklistEntry,
currentGroup.mChildren);
newChildren.mData.add(newOldWord);
final CharGroup newParent;
if (charIndex + differentCharIndex >= word.length) {
newParent = new CharGroup(
Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex),
shortcutTargets, null /* bigrams */, frequency, newChildren);
shortcutTargets, null /* bigrams */, frequency,
isNotAWord, isBlacklistEntry, newChildren);
} else {
newParent = new CharGroup(
Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex),
null /* shortcutTargets */, null /* bigrams */, -1, newChildren);
null /* shortcutTargets */, null /* bigrams */, -1,
false /* isNotAWord */, false /* isBlacklistEntry */, newChildren);
final CharGroup newWord = new CharGroup(Arrays.copyOfRange(word,
charIndex + differentCharIndex, word.length),
shortcutTargets, null /* bigrams */, frequency);
shortcutTargets, null /* bigrams */, frequency,
isNotAWord, isBlacklistEntry);
final int addIndex = word[charIndex + differentCharIndex]
> currentGroup.mChars[differentCharIndex] ? 1 : 0;
newChildren.mData.add(addIndex, newWord);
@ -483,7 +521,8 @@ public class FusionDictionary implements Iterable<Word> {
private static int findInsertionIndex(final Node node, int character) {
final ArrayList<CharGroup> data = node.mData;
final CharGroup reference = new CharGroup(new int[] { character },
null /* shortcutTargets */, null /* bigrams */, 0);
null /* shortcutTargets */, null /* bigrams */, 0, false /* isNotAWord */,
false /* isBlacklistEntry */);
int result = Collections.binarySearch(data, reference, CHARGROUP_COMPARATOR);
return result >= 0 ? result : -result - 1;
}
@ -748,7 +787,8 @@ public class FusionDictionary implements Iterable<Word> {
}
if (currentGroup.mFrequency >= 0)
return new Word(mCurrentString.toString(), currentGroup.mFrequency,
currentGroup.mShortcutTargets, currentGroup.mBigrams);
currentGroup.mShortcutTargets, currentGroup.mBigrams,
currentGroup.mIsNotAWord, currentGroup.mIsBlacklistEntry);
} else {
mPositions.removeLast();
currentPos = mPositions.getLast();

View File

@ -31,16 +31,21 @@ public class Word implements Comparable<Word> {
public final int mFrequency;
public final ArrayList<WeightedString> mShortcutTargets;
public final ArrayList<WeightedString> mBigrams;
public final boolean mIsNotAWord;
public final boolean mIsBlacklistEntry;
private int mHashCode = 0;
public Word(final String word, final int frequency,
final ArrayList<WeightedString> shortcutTargets,
final ArrayList<WeightedString> bigrams) {
final ArrayList<WeightedString> bigrams,
final boolean isNotAWord, final boolean isBlacklistEntry) {
mWord = word;
mFrequency = frequency;
mShortcutTargets = shortcutTargets;
mBigrams = bigrams;
mIsNotAWord = isNotAWord;
mIsBlacklistEntry = isBlacklistEntry;
}
private static int computeHashCode(Word word) {
@ -48,7 +53,9 @@ public class Word implements Comparable<Word> {
word.mWord,
word.mFrequency,
word.mShortcutTargets.hashCode(),
word.mBigrams.hashCode()
word.mBigrams.hashCode(),
word.mIsNotAWord,
word.mIsBlacklistEntry
});
}
@ -78,7 +85,9 @@ public class Word implements Comparable<Word> {
Word w = (Word)o;
return mFrequency == w.mFrequency && mWord.equals(w.mWord)
&& mShortcutTargets.equals(w.mShortcutTargets)
&& mBigrams.equals(w.mBigrams);
&& mBigrams.equals(w.mBigrams)
&& mIsNotAWord == w.mIsNotAWord
&& mIsBlacklistEntry == w.mIsBlacklistEntry;
}
@Override

View File

@ -43,6 +43,10 @@ class BinaryFormat {
static const int FLAG_HAS_SHORTCUT_TARGETS = 0x08;
// Flag for bigram presence
static const int FLAG_HAS_BIGRAMS = 0x04;
// Flag for non-words (typically, shortcut only entries)
static const int FLAG_IS_NOT_A_WORD = 0x02;
// Flag for blacklist
static const int FLAG_IS_BLACKLISTED = 0x01;
// Attribute (bigram/shortcut) related flags:
// Flag for presence of more attributes

View File

@ -72,6 +72,10 @@ class TerminalAttributes {
return ShortcutIterator(mDict, mStartPos + BinaryFormat::SHORTCUT_LIST_SIZE_SIZE, mFlags);
}
// Returns true when mFlags has FLAG_IS_BLACKLISTED and/or FLAG_IS_NOT_A_WORD set.
// The masked value is implicitly converted to bool, so any set bit in the mask yields true.
bool isBlacklistedOrNotAWord() const {
return mFlags & (BinaryFormat::FLAG_IS_BLACKLISTED | BinaryFormat::FLAG_IS_NOT_A_WORD);
}
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(TerminalAttributes);
const uint8_t *const mDict;

View File

@ -391,9 +391,11 @@ inline void UnigramDictionary::onTerminal(const int probability,
const int finalProbability =
correction->getFinalProbability(probability, &wordPointer, &wordLength);
if (0 != finalProbability) {
if (0 != finalProbability && !terminalAttributes.isBlacklistedOrNotAWord()) {
// If the probability is 0, we don't want to add this word. However we still
// want to add its shortcuts (including a possible whitelist entry) if any.
// Furthermore, if this is not a word (shortcut only for example) or a blacklisted
// entry then we never want to suggest this.
addWord(wordPointer, wordLength, finalProbability, masterQueue,
Dictionary::KIND_CORRECTION);
}
@ -841,6 +843,12 @@ int UnigramDictionary::getFrequency(const int32_t *const inWord, const int lengt
return NOT_A_PROBABILITY;
}
const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
if (flags & (BinaryFormat::FLAG_IS_BLACKLISTED | BinaryFormat::FLAG_IS_NOT_A_WORD)) {
// If this is not a word, or if it's a blacklisted entry, it should behave as
// having no frequency outside of the suggestion process (where it should be used
// for shortcuts).
return NOT_A_PROBABILITY;
}
const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags));
if (hasMultipleChars) {
pos = BinaryFormat::skipOtherCharacters(root, pos);

View File

@ -80,7 +80,7 @@ public class BinaryDictIOTests extends AndroidTestCase {
final List<String> words) {
for (int i = 0; i < number; ++i) {
final String word = words.get(i);
dict.add(word, UNIGRAM_FREQ, null);
dict.add(word, UNIGRAM_FREQ, null, false /* isNotAWord */);
}
}

View File

@ -31,16 +31,16 @@ public class FusionDictionaryTests extends AndroidTestCase {
FusionDictionary dict = new FusionDictionary(new Node(),
new FusionDictionary.DictionaryOptions(new HashMap<String,String>(), false, false));
dict.add("abc", 10, null);
dict.add("abc", 10, null, false /* isNotAWord */);
assertNull(FusionDictionary.findWordInTree(dict.mRoot, "aaa"));
assertNotNull(FusionDictionary.findWordInTree(dict.mRoot, "abc"));
dict.add("aa", 10, null);
dict.add("aa", 10, null, false /* isNotAWord */);
assertNull(FusionDictionary.findWordInTree(dict.mRoot, "aaa"));
assertNotNull(FusionDictionary.findWordInTree(dict.mRoot, "aa"));
dict.add("babcd", 10, null);
dict.add("bacde", 10, null);
dict.add("babcd", 10, null, false /* isNotAWord */);
dict.add("bacde", 10, null, false /* isNotAWord */);
assertNull(FusionDictionary.findWordInTree(dict.mRoot, "ba"));
assertNotNull(FusionDictionary.findWordInTree(dict.mRoot, "babcd"));
assertNotNull(FusionDictionary.findWordInTree(dict.mRoot, "bacde"));

View File

@ -50,6 +50,7 @@ public class XmlDictInputOutput {
private static final String SHORTCUT_TAG = "shortcut";
private static final String FREQUENCY_ATTR = "f";
private static final String WORD_ATTR = "word";
private static final String NOT_A_WORD_ATTR = "not_a_word";
private static final int SHORTCUT_ONLY_DEFAULT_FREQ = 1;
@ -92,7 +93,7 @@ public class XmlDictInputOutput {
final FusionDictionary dict = mDictionary;
for (final String shortcutOnly : mShortcutsMap.keySet()) {
if (dict.hasWord(shortcutOnly)) continue;
dict.add(shortcutOnly, 0, mShortcutsMap.get(shortcutOnly));
dict.add(shortcutOnly, 0, mShortcutsMap.get(shortcutOnly), true /* isNotAWord */);
}
mDictionary = null;
mShortcutsMap.clear();
@ -144,7 +145,7 @@ public class XmlDictInputOutput {
@Override
public void endElement(String uri, String localName, String qName) {
if (WORD == mState) {
mDictionary.add(mWord, mFreq, mShortcutsMap.get(mWord));
mDictionary.add(mWord, mFreq, mShortcutsMap.get(mWord), false /* isNotAWord */);
mState = START;
}
}
@ -345,7 +346,8 @@ public class XmlDictInputOutput {
destination.write("<!-- Warning: there is no code to read this format yet. -->\n");
for (Word word : set) {
destination.write(" <" + WORD_TAG + " " + WORD_ATTR + "=\"" + word.mWord + "\" "
+ FREQUENCY_ATTR + "=\"" + word.mFrequency + "\">");
+ FREQUENCY_ATTR + "=\"" + word.mFrequency
+ (word.mIsNotAWord ? "\" " + NOT_A_WORD_ATTR + "=\"true" : "") + "\">");
if (null != word.mShortcutTargets) {
destination.write("\n");
for (WeightedString target : word.mShortcutTargets) {

View File

@ -43,11 +43,11 @@ public class BinaryDictInputOutputTest extends TestCase {
final FusionDictionary dict = new FusionDictionary(new Node(),
new DictionaryOptions(new HashMap<String, String>(),
false /* germanUmlautProcessing */, false /* frenchLigatureProcessing */));
dict.add("foo", 1, null);
dict.add("fta", 1, null);
dict.add("ftb", 1, null);
dict.add("bar", 1, null);
dict.add("fool", 1, null);
dict.add("foo", 1, null, false /* isNotAWord */);
dict.add("fta", 1, null, false /* isNotAWord */);
dict.add("ftb", 1, null, false /* isNotAWord */);
dict.add("bar", 1, null, false /* isNotAWord */);
dict.add("fool", 1, null, false /* isNotAWord */);
final ArrayList<Node> result = BinaryDictInputOutput.flattenTree(dict.mRoot);
assertEquals(4, result.size());
while (!result.isEmpty()) {