am 2035b946: Merge "Reinstate the shortcut-only attribute" into jb-mr1-dev
* commit '2035b946a38dd072119a9771252aef3667f08f68': Reinstate the shortcut-only attribute
main
commit
5bf2f9f8cf
|
@ -172,12 +172,12 @@ abstract public class ExpandableBinaryDictionary extends Dictionary {
|
||||||
// considering performance regression.
|
// considering performance regression.
|
||||||
protected void addWord(final String word, final String shortcutTarget, final int frequency) {
|
protected void addWord(final String word, final String shortcutTarget, final int frequency) {
|
||||||
if (shortcutTarget == null) {
|
if (shortcutTarget == null) {
|
||||||
mFusionDictionary.add(word, frequency, null);
|
mFusionDictionary.add(word, frequency, null, false /* isNotAWord */);
|
||||||
} else {
|
} else {
|
||||||
// TODO: Do this in the subclass, with this class taking an arraylist.
|
// TODO: Do this in the subclass, with this class taking an arraylist.
|
||||||
final ArrayList<WeightedString> shortcutTargets = CollectionUtils.newArrayList();
|
final ArrayList<WeightedString> shortcutTargets = CollectionUtils.newArrayList();
|
||||||
shortcutTargets.add(new WeightedString(shortcutTarget, frequency));
|
shortcutTargets.add(new WeightedString(shortcutTarget, frequency));
|
||||||
mFusionDictionary.add(word, frequency, shortcutTargets);
|
mFusionDictionary.add(word, frequency, shortcutTargets, false /* isNotAWord */);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -56,6 +56,8 @@ public class BinaryDictInputOutput {
|
||||||
* s | has a terminal ? 1 bit, 1 = yes, 0 = no : FLAG_IS_TERMINAL
|
* s | has a terminal ? 1 bit, 1 = yes, 0 = no : FLAG_IS_TERMINAL
|
||||||
* | has shortcut targets ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_SHORTCUT_TARGETS
|
* | has shortcut targets ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_SHORTCUT_TARGETS
|
||||||
* | has bigrams ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_BIGRAMS
|
* | has bigrams ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_BIGRAMS
|
||||||
|
* | is not a word ? 1 bit, 1 = yes, 0 = no : FLAG_IS_NOT_A_WORD
|
||||||
|
* | is blacklisted ? 1 bit, 1 = yes, 0 = no : FLAG_IS_BLACKLISTED
|
||||||
*
|
*
|
||||||
* c | IF FLAG_HAS_MULTIPLE_CHARS
|
* c | IF FLAG_HAS_MULTIPLE_CHARS
|
||||||
* h | char, char, char, char n * (1 or 3 bytes) : use CharGroupInfo for i/o helpers
|
* h | char, char, char, char n * (1 or 3 bytes) : use CharGroupInfo for i/o helpers
|
||||||
|
@ -154,6 +156,8 @@ public class BinaryDictInputOutput {
|
||||||
private static final int FLAG_IS_TERMINAL = 0x10;
|
private static final int FLAG_IS_TERMINAL = 0x10;
|
||||||
private static final int FLAG_HAS_SHORTCUT_TARGETS = 0x08;
|
private static final int FLAG_HAS_SHORTCUT_TARGETS = 0x08;
|
||||||
private static final int FLAG_HAS_BIGRAMS = 0x04;
|
private static final int FLAG_HAS_BIGRAMS = 0x04;
|
||||||
|
private static final int FLAG_IS_NOT_A_WORD = 0x02;
|
||||||
|
private static final int FLAG_IS_BLACKLISTED = 0x01;
|
||||||
|
|
||||||
private static final int FLAG_ATTRIBUTE_HAS_NEXT = 0x80;
|
private static final int FLAG_ATTRIBUTE_HAS_NEXT = 0x80;
|
||||||
private static final int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40;
|
private static final int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40;
|
||||||
|
@ -779,6 +783,12 @@ public class BinaryDictInputOutput {
|
||||||
}
|
}
|
||||||
flags |= FLAG_HAS_BIGRAMS;
|
flags |= FLAG_HAS_BIGRAMS;
|
||||||
}
|
}
|
||||||
|
if (group.mIsNotAWord) {
|
||||||
|
flags |= FLAG_IS_NOT_A_WORD;
|
||||||
|
}
|
||||||
|
if (group.mIsBlacklistEntry) {
|
||||||
|
flags |= FLAG_IS_BLACKLISTED;
|
||||||
|
}
|
||||||
return flags;
|
return flags;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1353,12 +1363,14 @@ public class BinaryDictInputOutput {
|
||||||
buffer.position(currentPosition);
|
buffer.position(currentPosition);
|
||||||
}
|
}
|
||||||
nodeContents.add(
|
nodeContents.add(
|
||||||
new CharGroup(info.mCharacters, shortcutTargets,
|
new CharGroup(info.mCharacters, shortcutTargets, bigrams, info.mFrequency,
|
||||||
bigrams, info.mFrequency, children));
|
0 != (info.mFlags & FLAG_IS_NOT_A_WORD),
|
||||||
|
0 != (info.mFlags & FLAG_IS_BLACKLISTED), children));
|
||||||
} else {
|
} else {
|
||||||
nodeContents.add(
|
nodeContents.add(
|
||||||
new CharGroup(info.mCharacters, shortcutTargets,
|
new CharGroup(info.mCharacters, shortcutTargets, bigrams, info.mFrequency,
|
||||||
bigrams, info.mFrequency));
|
0 != (info.mFlags & FLAG_IS_NOT_A_WORD),
|
||||||
|
0 != (info.mFlags & FLAG_IS_BLACKLISTED)));
|
||||||
}
|
}
|
||||||
groupOffset = info.mEndAddress;
|
groupOffset = info.mEndAddress;
|
||||||
}
|
}
|
||||||
|
@ -1574,7 +1586,11 @@ public class BinaryDictInputOutput {
|
||||||
0 != (optionsFlags & FRENCH_LIGATURE_PROCESSING_FLAG)));
|
0 != (optionsFlags & FRENCH_LIGATURE_PROCESSING_FLAG)));
|
||||||
if (null != dict) {
|
if (null != dict) {
|
||||||
for (final Word w : dict) {
|
for (final Word w : dict) {
|
||||||
newDict.add(w.mWord, w.mFrequency, w.mShortcutTargets);
|
if (w.mIsBlacklistEntry) {
|
||||||
|
newDict.addBlacklistEntry(w.mWord, w.mShortcutTargets, w.mIsNotAWord);
|
||||||
|
} else {
|
||||||
|
newDict.add(w.mWord, w.mFrequency, w.mShortcutTargets, w.mIsNotAWord);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
for (final Word w : dict) {
|
for (final Word w : dict) {
|
||||||
// By construction a binary dictionary may not have bigrams pointing to
|
// By construction a binary dictionary may not have bigrams pointing to
|
||||||
|
|
|
@ -101,26 +101,34 @@ public class FusionDictionary implements Iterable<Word> {
|
||||||
ArrayList<WeightedString> mBigrams;
|
ArrayList<WeightedString> mBigrams;
|
||||||
int mFrequency; // NOT_A_TERMINAL == mFrequency indicates this is not a terminal.
|
int mFrequency; // NOT_A_TERMINAL == mFrequency indicates this is not a terminal.
|
||||||
Node mChildren;
|
Node mChildren;
|
||||||
|
boolean mIsNotAWord; // Only a shortcut
|
||||||
|
boolean mIsBlacklistEntry;
|
||||||
// The two following members to help with binary generation
|
// The two following members to help with binary generation
|
||||||
int mCachedSize;
|
int mCachedSize;
|
||||||
int mCachedAddress;
|
int mCachedAddress;
|
||||||
|
|
||||||
public CharGroup(final int[] chars, final ArrayList<WeightedString> shortcutTargets,
|
public CharGroup(final int[] chars, final ArrayList<WeightedString> shortcutTargets,
|
||||||
final ArrayList<WeightedString> bigrams, final int frequency) {
|
final ArrayList<WeightedString> bigrams, final int frequency,
|
||||||
|
final boolean isNotAWord, final boolean isBlacklistEntry) {
|
||||||
mChars = chars;
|
mChars = chars;
|
||||||
mFrequency = frequency;
|
mFrequency = frequency;
|
||||||
mShortcutTargets = shortcutTargets;
|
mShortcutTargets = shortcutTargets;
|
||||||
mBigrams = bigrams;
|
mBigrams = bigrams;
|
||||||
mChildren = null;
|
mChildren = null;
|
||||||
|
mIsNotAWord = isNotAWord;
|
||||||
|
mIsBlacklistEntry = isBlacklistEntry;
|
||||||
}
|
}
|
||||||
|
|
||||||
public CharGroup(final int[] chars, final ArrayList<WeightedString> shortcutTargets,
|
public CharGroup(final int[] chars, final ArrayList<WeightedString> shortcutTargets,
|
||||||
final ArrayList<WeightedString> bigrams, final int frequency, final Node children) {
|
final ArrayList<WeightedString> bigrams, final int frequency,
|
||||||
|
final boolean isNotAWord, final boolean isBlacklistEntry, final Node children) {
|
||||||
mChars = chars;
|
mChars = chars;
|
||||||
mFrequency = frequency;
|
mFrequency = frequency;
|
||||||
mShortcutTargets = shortcutTargets;
|
mShortcutTargets = shortcutTargets;
|
||||||
mBigrams = bigrams;
|
mBigrams = bigrams;
|
||||||
mChildren = children;
|
mChildren = children;
|
||||||
|
mIsNotAWord = isNotAWord;
|
||||||
|
mIsBlacklistEntry = isBlacklistEntry;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void addChild(CharGroup n) {
|
public void addChild(CharGroup n) {
|
||||||
|
@ -197,8 +205,9 @@ public class FusionDictionary implements Iterable<Word> {
|
||||||
* the existing ones if any. Note: unigram, bigram, and shortcut frequencies are only
|
* the existing ones if any. Note: unigram, bigram, and shortcut frequencies are only
|
||||||
* updated if they are higher than the existing ones.
|
* updated if they are higher than the existing ones.
|
||||||
*/
|
*/
|
||||||
public void update(int frequency, ArrayList<WeightedString> shortcutTargets,
|
public void update(final int frequency, final ArrayList<WeightedString> shortcutTargets,
|
||||||
ArrayList<WeightedString> bigrams) {
|
final ArrayList<WeightedString> bigrams,
|
||||||
|
final boolean isNotAWord, final boolean isBlacklistEntry) {
|
||||||
if (frequency > mFrequency) {
|
if (frequency > mFrequency) {
|
||||||
mFrequency = frequency;
|
mFrequency = frequency;
|
||||||
}
|
}
|
||||||
|
@ -234,6 +243,8 @@ public class FusionDictionary implements Iterable<Word> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
mIsNotAWord = isNotAWord;
|
||||||
|
mIsBlacklistEntry = isBlacklistEntry;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -296,10 +307,24 @@ public class FusionDictionary implements Iterable<Word> {
|
||||||
* @param word the word to add.
|
* @param word the word to add.
|
||||||
* @param frequency the frequency of the word, in the range [0..255].
|
* @param frequency the frequency of the word, in the range [0..255].
|
||||||
* @param shortcutTargets a list of shortcut targets for this word, or null.
|
* @param shortcutTargets a list of shortcut targets for this word, or null.
|
||||||
|
* @param isNotAWord true if this should not be considered a word (e.g. shortcut only)
|
||||||
*/
|
*/
|
||||||
public void add(final String word, final int frequency,
|
public void add(final String word, final int frequency,
|
||||||
final ArrayList<WeightedString> shortcutTargets) {
|
final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord) {
|
||||||
add(getCodePoints(word), frequency, shortcutTargets);
|
add(getCodePoints(word), frequency, shortcutTargets, isNotAWord,
|
||||||
|
false /* isBlacklistEntry */);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Helper method to add a blacklist entry as a string.
|
||||||
|
*
|
||||||
|
* @param word the word to add as a blacklist entry.
|
||||||
|
* @param shortcutTargets a list of shortcut targets for this word, or null.
|
||||||
|
* @param isNotAWord true if this is not a word for spellchecking purposes (shortcut only or so)
|
||||||
|
*/
|
||||||
|
public void addBlacklistEntry(final String word,
|
||||||
|
final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord) {
|
||||||
|
add(getCodePoints(word), 0, shortcutTargets, isNotAWord, true /* isBlacklistEntry */);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -332,7 +357,8 @@ public class FusionDictionary implements Iterable<Word> {
|
||||||
if (charGroup != null) {
|
if (charGroup != null) {
|
||||||
final CharGroup charGroup2 = findWordInTree(mRoot, word2);
|
final CharGroup charGroup2 = findWordInTree(mRoot, word2);
|
||||||
if (charGroup2 == null) {
|
if (charGroup2 == null) {
|
||||||
add(getCodePoints(word2), 0, null);
|
add(getCodePoints(word2), 0, null, false /* isNotAWord */,
|
||||||
|
false /* isBlacklistEntry */);
|
||||||
}
|
}
|
||||||
charGroup.addBigram(word2, frequency);
|
charGroup.addBigram(word2, frequency);
|
||||||
} else {
|
} else {
|
||||||
|
@ -349,9 +375,12 @@ public class FusionDictionary implements Iterable<Word> {
|
||||||
* @param word the word, as an int array.
|
* @param word the word, as an int array.
|
||||||
* @param frequency the frequency of the word, in the range [0..255].
|
* @param frequency the frequency of the word, in the range [0..255].
|
||||||
* @param shortcutTargets an optional list of shortcut targets for this word (null if none).
|
* @param shortcutTargets an optional list of shortcut targets for this word (null if none).
|
||||||
|
* @param isNotAWord true if this is not a word for spellchecking purposes (shortcut only or so)
|
||||||
|
* @param isBlacklistEntry true if this is a blacklisted word, false otherwise
|
||||||
*/
|
*/
|
||||||
private void add(final int[] word, final int frequency,
|
private void add(final int[] word, final int frequency,
|
||||||
final ArrayList<WeightedString> shortcutTargets) {
|
final ArrayList<WeightedString> shortcutTargets,
|
||||||
|
final boolean isNotAWord, final boolean isBlacklistEntry) {
|
||||||
assert(frequency >= 0 && frequency <= 255);
|
assert(frequency >= 0 && frequency <= 255);
|
||||||
Node currentNode = mRoot;
|
Node currentNode = mRoot;
|
||||||
int charIndex = 0;
|
int charIndex = 0;
|
||||||
|
@ -376,7 +405,7 @@ public class FusionDictionary implements Iterable<Word> {
|
||||||
final int insertionIndex = findInsertionIndex(currentNode, word[charIndex]);
|
final int insertionIndex = findInsertionIndex(currentNode, word[charIndex]);
|
||||||
final CharGroup newGroup = new CharGroup(
|
final CharGroup newGroup = new CharGroup(
|
||||||
Arrays.copyOfRange(word, charIndex, word.length),
|
Arrays.copyOfRange(word, charIndex, word.length),
|
||||||
shortcutTargets, null /* bigrams */, frequency);
|
shortcutTargets, null /* bigrams */, frequency, isNotAWord, isBlacklistEntry);
|
||||||
currentNode.mData.add(insertionIndex, newGroup);
|
currentNode.mData.add(insertionIndex, newGroup);
|
||||||
if (DBG) checkStack(currentNode);
|
if (DBG) checkStack(currentNode);
|
||||||
} else {
|
} else {
|
||||||
|
@ -386,13 +415,15 @@ public class FusionDictionary implements Iterable<Word> {
|
||||||
// The new word is a prefix of an existing word, but the node on which it
|
// The new word is a prefix of an existing word, but the node on which it
|
||||||
// should end already exists as is. Since the old CharNode was not a terminal,
|
// should end already exists as is. Since the old CharNode was not a terminal,
|
||||||
// make it one by filling in its frequency and other attributes
|
// make it one by filling in its frequency and other attributes
|
||||||
currentGroup.update(frequency, shortcutTargets, null);
|
currentGroup.update(frequency, shortcutTargets, null, isNotAWord,
|
||||||
|
isBlacklistEntry);
|
||||||
} else {
|
} else {
|
||||||
// The new word matches the full old word and extends past it.
|
// The new word matches the full old word and extends past it.
|
||||||
// We only have to create a new node and add it to the end of this.
|
// We only have to create a new node and add it to the end of this.
|
||||||
final CharGroup newNode = new CharGroup(
|
final CharGroup newNode = new CharGroup(
|
||||||
Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length),
|
Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length),
|
||||||
shortcutTargets, null /* bigrams */, frequency);
|
shortcutTargets, null /* bigrams */, frequency, isNotAWord,
|
||||||
|
isBlacklistEntry);
|
||||||
currentGroup.mChildren = new Node();
|
currentGroup.mChildren = new Node();
|
||||||
currentGroup.mChildren.mData.add(newNode);
|
currentGroup.mChildren.mData.add(newNode);
|
||||||
}
|
}
|
||||||
|
@ -400,7 +431,9 @@ public class FusionDictionary implements Iterable<Word> {
|
||||||
if (0 == differentCharIndex) {
|
if (0 == differentCharIndex) {
|
||||||
// Exact same word. Update the frequency if higher. This will also add the
|
// Exact same word. Update the frequency if higher. This will also add the
|
||||||
// new shortcuts to the existing shortcut list if it already exists.
|
// new shortcuts to the existing shortcut list if it already exists.
|
||||||
currentGroup.update(frequency, shortcutTargets, null);
|
currentGroup.update(frequency, shortcutTargets, null,
|
||||||
|
currentGroup.mIsNotAWord && isNotAWord,
|
||||||
|
currentGroup.mIsBlacklistEntry || isBlacklistEntry);
|
||||||
} else {
|
} else {
|
||||||
// Partial prefix match only. We have to replace the current node with a node
|
// Partial prefix match only. We have to replace the current node with a node
|
||||||
// containing the current prefix and create two new ones for the tails.
|
// containing the current prefix and create two new ones for the tails.
|
||||||
|
@ -408,21 +441,26 @@ public class FusionDictionary implements Iterable<Word> {
|
||||||
final CharGroup newOldWord = new CharGroup(
|
final CharGroup newOldWord = new CharGroup(
|
||||||
Arrays.copyOfRange(currentGroup.mChars, differentCharIndex,
|
Arrays.copyOfRange(currentGroup.mChars, differentCharIndex,
|
||||||
currentGroup.mChars.length), currentGroup.mShortcutTargets,
|
currentGroup.mChars.length), currentGroup.mShortcutTargets,
|
||||||
currentGroup.mBigrams, currentGroup.mFrequency, currentGroup.mChildren);
|
currentGroup.mBigrams, currentGroup.mFrequency,
|
||||||
|
currentGroup.mIsNotAWord, currentGroup.mIsBlacklistEntry,
|
||||||
|
currentGroup.mChildren);
|
||||||
newChildren.mData.add(newOldWord);
|
newChildren.mData.add(newOldWord);
|
||||||
|
|
||||||
final CharGroup newParent;
|
final CharGroup newParent;
|
||||||
if (charIndex + differentCharIndex >= word.length) {
|
if (charIndex + differentCharIndex >= word.length) {
|
||||||
newParent = new CharGroup(
|
newParent = new CharGroup(
|
||||||
Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex),
|
Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex),
|
||||||
shortcutTargets, null /* bigrams */, frequency, newChildren);
|
shortcutTargets, null /* bigrams */, frequency,
|
||||||
|
isNotAWord, isBlacklistEntry, newChildren);
|
||||||
} else {
|
} else {
|
||||||
newParent = new CharGroup(
|
newParent = new CharGroup(
|
||||||
Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex),
|
Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex),
|
||||||
null /* shortcutTargets */, null /* bigrams */, -1, newChildren);
|
null /* shortcutTargets */, null /* bigrams */, -1,
|
||||||
|
false /* isNotAWord */, false /* isBlacklistEntry */, newChildren);
|
||||||
final CharGroup newWord = new CharGroup(Arrays.copyOfRange(word,
|
final CharGroup newWord = new CharGroup(Arrays.copyOfRange(word,
|
||||||
charIndex + differentCharIndex, word.length),
|
charIndex + differentCharIndex, word.length),
|
||||||
shortcutTargets, null /* bigrams */, frequency);
|
shortcutTargets, null /* bigrams */, frequency,
|
||||||
|
isNotAWord, isBlacklistEntry);
|
||||||
final int addIndex = word[charIndex + differentCharIndex]
|
final int addIndex = word[charIndex + differentCharIndex]
|
||||||
> currentGroup.mChars[differentCharIndex] ? 1 : 0;
|
> currentGroup.mChars[differentCharIndex] ? 1 : 0;
|
||||||
newChildren.mData.add(addIndex, newWord);
|
newChildren.mData.add(addIndex, newWord);
|
||||||
|
@ -483,7 +521,8 @@ public class FusionDictionary implements Iterable<Word> {
|
||||||
private static int findInsertionIndex(final Node node, int character) {
|
private static int findInsertionIndex(final Node node, int character) {
|
||||||
final ArrayList<CharGroup> data = node.mData;
|
final ArrayList<CharGroup> data = node.mData;
|
||||||
final CharGroup reference = new CharGroup(new int[] { character },
|
final CharGroup reference = new CharGroup(new int[] { character },
|
||||||
null /* shortcutTargets */, null /* bigrams */, 0);
|
null /* shortcutTargets */, null /* bigrams */, 0, false /* isNotAWord */,
|
||||||
|
false /* isBlacklistEntry */);
|
||||||
int result = Collections.binarySearch(data, reference, CHARGROUP_COMPARATOR);
|
int result = Collections.binarySearch(data, reference, CHARGROUP_COMPARATOR);
|
||||||
return result >= 0 ? result : -result - 1;
|
return result >= 0 ? result : -result - 1;
|
||||||
}
|
}
|
||||||
|
@ -748,7 +787,8 @@ public class FusionDictionary implements Iterable<Word> {
|
||||||
}
|
}
|
||||||
if (currentGroup.mFrequency >= 0)
|
if (currentGroup.mFrequency >= 0)
|
||||||
return new Word(mCurrentString.toString(), currentGroup.mFrequency,
|
return new Word(mCurrentString.toString(), currentGroup.mFrequency,
|
||||||
currentGroup.mShortcutTargets, currentGroup.mBigrams);
|
currentGroup.mShortcutTargets, currentGroup.mBigrams,
|
||||||
|
currentGroup.mIsNotAWord, currentGroup.mIsBlacklistEntry);
|
||||||
} else {
|
} else {
|
||||||
mPositions.removeLast();
|
mPositions.removeLast();
|
||||||
currentPos = mPositions.getLast();
|
currentPos = mPositions.getLast();
|
||||||
|
|
|
@ -31,16 +31,21 @@ public class Word implements Comparable<Word> {
|
||||||
public final int mFrequency;
|
public final int mFrequency;
|
||||||
public final ArrayList<WeightedString> mShortcutTargets;
|
public final ArrayList<WeightedString> mShortcutTargets;
|
||||||
public final ArrayList<WeightedString> mBigrams;
|
public final ArrayList<WeightedString> mBigrams;
|
||||||
|
public final boolean mIsNotAWord;
|
||||||
|
public final boolean mIsBlacklistEntry;
|
||||||
|
|
||||||
private int mHashCode = 0;
|
private int mHashCode = 0;
|
||||||
|
|
||||||
public Word(final String word, final int frequency,
|
public Word(final String word, final int frequency,
|
||||||
final ArrayList<WeightedString> shortcutTargets,
|
final ArrayList<WeightedString> shortcutTargets,
|
||||||
final ArrayList<WeightedString> bigrams) {
|
final ArrayList<WeightedString> bigrams,
|
||||||
|
final boolean isNotAWord, final boolean isBlacklistEntry) {
|
||||||
mWord = word;
|
mWord = word;
|
||||||
mFrequency = frequency;
|
mFrequency = frequency;
|
||||||
mShortcutTargets = shortcutTargets;
|
mShortcutTargets = shortcutTargets;
|
||||||
mBigrams = bigrams;
|
mBigrams = bigrams;
|
||||||
|
mIsNotAWord = isNotAWord;
|
||||||
|
mIsBlacklistEntry = isBlacklistEntry;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static int computeHashCode(Word word) {
|
private static int computeHashCode(Word word) {
|
||||||
|
@ -48,7 +53,9 @@ public class Word implements Comparable<Word> {
|
||||||
word.mWord,
|
word.mWord,
|
||||||
word.mFrequency,
|
word.mFrequency,
|
||||||
word.mShortcutTargets.hashCode(),
|
word.mShortcutTargets.hashCode(),
|
||||||
word.mBigrams.hashCode()
|
word.mBigrams.hashCode(),
|
||||||
|
word.mIsNotAWord,
|
||||||
|
word.mIsBlacklistEntry
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -78,7 +85,9 @@ public class Word implements Comparable<Word> {
|
||||||
Word w = (Word)o;
|
Word w = (Word)o;
|
||||||
return mFrequency == w.mFrequency && mWord.equals(w.mWord)
|
return mFrequency == w.mFrequency && mWord.equals(w.mWord)
|
||||||
&& mShortcutTargets.equals(w.mShortcutTargets)
|
&& mShortcutTargets.equals(w.mShortcutTargets)
|
||||||
&& mBigrams.equals(w.mBigrams);
|
&& mBigrams.equals(w.mBigrams)
|
||||||
|
&& mIsNotAWord == w.mIsNotAWord
|
||||||
|
&& mIsBlacklistEntry == w.mIsBlacklistEntry;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -43,6 +43,10 @@ class BinaryFormat {
|
||||||
static const int FLAG_HAS_SHORTCUT_TARGETS = 0x08;
|
static const int FLAG_HAS_SHORTCUT_TARGETS = 0x08;
|
||||||
// Flag for bigram presence
|
// Flag for bigram presence
|
||||||
static const int FLAG_HAS_BIGRAMS = 0x04;
|
static const int FLAG_HAS_BIGRAMS = 0x04;
|
||||||
|
// Flag for non-words (typically, shortcut only entries)
|
||||||
|
static const int FLAG_IS_NOT_A_WORD = 0x02;
|
||||||
|
// Flag for blacklist
|
||||||
|
static const int FLAG_IS_BLACKLISTED = 0x01;
|
||||||
|
|
||||||
// Attribute (bigram/shortcut) related flags:
|
// Attribute (bigram/shortcut) related flags:
|
||||||
// Flag for presence of more attributes
|
// Flag for presence of more attributes
|
||||||
|
|
|
@ -72,6 +72,10 @@ class TerminalAttributes {
|
||||||
return ShortcutIterator(mDict, mStartPos + BinaryFormat::SHORTCUT_LIST_SIZE_SIZE, mFlags);
|
return ShortcutIterator(mDict, mStartPos + BinaryFormat::SHORTCUT_LIST_SIZE_SIZE, mFlags);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool isBlacklistedOrNotAWord() const {
|
||||||
|
return mFlags & (BinaryFormat::FLAG_IS_BLACKLISTED | BinaryFormat::FLAG_IS_NOT_A_WORD);
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
DISALLOW_IMPLICIT_CONSTRUCTORS(TerminalAttributes);
|
DISALLOW_IMPLICIT_CONSTRUCTORS(TerminalAttributes);
|
||||||
const uint8_t *const mDict;
|
const uint8_t *const mDict;
|
||||||
|
|
|
@ -391,9 +391,11 @@ inline void UnigramDictionary::onTerminal(const int probability,
|
||||||
const int finalProbability =
|
const int finalProbability =
|
||||||
correction->getFinalProbability(probability, &wordPointer, &wordLength);
|
correction->getFinalProbability(probability, &wordPointer, &wordLength);
|
||||||
|
|
||||||
if (0 != finalProbability) {
|
if (0 != finalProbability && !terminalAttributes.isBlacklistedOrNotAWord()) {
|
||||||
// If the probability is 0, we don't want to add this word. However we still
|
// If the probability is 0, we don't want to add this word. However we still
|
||||||
// want to add its shortcuts (including a possible whitelist entry) if any.
|
// want to add its shortcuts (including a possible whitelist entry) if any.
|
||||||
|
// Furthermore, if this is not a word (shortcut only for example) or a blacklisted
|
||||||
|
// entry then we never want to suggest this.
|
||||||
addWord(wordPointer, wordLength, finalProbability, masterQueue,
|
addWord(wordPointer, wordLength, finalProbability, masterQueue,
|
||||||
Dictionary::KIND_CORRECTION);
|
Dictionary::KIND_CORRECTION);
|
||||||
}
|
}
|
||||||
|
@ -841,6 +843,12 @@ int UnigramDictionary::getFrequency(const int32_t *const inWord, const int lengt
|
||||||
return NOT_A_PROBABILITY;
|
return NOT_A_PROBABILITY;
|
||||||
}
|
}
|
||||||
const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
|
const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
|
||||||
|
if (flags & (BinaryFormat::FLAG_IS_BLACKLISTED | BinaryFormat::FLAG_IS_NOT_A_WORD)) {
|
||||||
|
// If this is not a word, or if it's a blacklisted entry, it should behave as
|
||||||
|
// having no frequency outside of the suggestion process (where it should be used
|
||||||
|
// for shortcuts).
|
||||||
|
return NOT_A_PROBABILITY;
|
||||||
|
}
|
||||||
const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags));
|
const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags));
|
||||||
if (hasMultipleChars) {
|
if (hasMultipleChars) {
|
||||||
pos = BinaryFormat::skipOtherCharacters(root, pos);
|
pos = BinaryFormat::skipOtherCharacters(root, pos);
|
||||||
|
|
|
@ -89,7 +89,7 @@ public class BinaryDictIOTests extends AndroidTestCase {
|
||||||
final List<String> words) {
|
final List<String> words) {
|
||||||
for (int i = 0; i < number; ++i) {
|
for (int i = 0; i < number; ++i) {
|
||||||
final String word = words.get(i);
|
final String word = words.get(i);
|
||||||
dict.add(word, UNIGRAM_FREQ, null);
|
dict.add(word, UNIGRAM_FREQ, null, false /* isNotAWord */);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -31,16 +31,16 @@ public class FusionDictionaryTests extends AndroidTestCase {
|
||||||
FusionDictionary dict = new FusionDictionary(new Node(),
|
FusionDictionary dict = new FusionDictionary(new Node(),
|
||||||
new FusionDictionary.DictionaryOptions(new HashMap<String,String>(), false, false));
|
new FusionDictionary.DictionaryOptions(new HashMap<String,String>(), false, false));
|
||||||
|
|
||||||
dict.add("abc", 10, null);
|
dict.add("abc", 10, null, false /* isNotAWord */);
|
||||||
assertNull(FusionDictionary.findWordInTree(dict.mRoot, "aaa"));
|
assertNull(FusionDictionary.findWordInTree(dict.mRoot, "aaa"));
|
||||||
assertNotNull(FusionDictionary.findWordInTree(dict.mRoot, "abc"));
|
assertNotNull(FusionDictionary.findWordInTree(dict.mRoot, "abc"));
|
||||||
|
|
||||||
dict.add("aa", 10, null);
|
dict.add("aa", 10, null, false /* isNotAWord */);
|
||||||
assertNull(FusionDictionary.findWordInTree(dict.mRoot, "aaa"));
|
assertNull(FusionDictionary.findWordInTree(dict.mRoot, "aaa"));
|
||||||
assertNotNull(FusionDictionary.findWordInTree(dict.mRoot, "aa"));
|
assertNotNull(FusionDictionary.findWordInTree(dict.mRoot, "aa"));
|
||||||
|
|
||||||
dict.add("babcd", 10, null);
|
dict.add("babcd", 10, null, false /* isNotAWord */);
|
||||||
dict.add("bacde", 10, null);
|
dict.add("bacde", 10, null, false /* isNotAWord */);
|
||||||
assertNull(FusionDictionary.findWordInTree(dict.mRoot, "ba"));
|
assertNull(FusionDictionary.findWordInTree(dict.mRoot, "ba"));
|
||||||
assertNotNull(FusionDictionary.findWordInTree(dict.mRoot, "babcd"));
|
assertNotNull(FusionDictionary.findWordInTree(dict.mRoot, "babcd"));
|
||||||
assertNotNull(FusionDictionary.findWordInTree(dict.mRoot, "bacde"));
|
assertNotNull(FusionDictionary.findWordInTree(dict.mRoot, "bacde"));
|
||||||
|
|
|
@ -50,6 +50,7 @@ public class XmlDictInputOutput {
|
||||||
private static final String SHORTCUT_TAG = "shortcut";
|
private static final String SHORTCUT_TAG = "shortcut";
|
||||||
private static final String FREQUENCY_ATTR = "f";
|
private static final String FREQUENCY_ATTR = "f";
|
||||||
private static final String WORD_ATTR = "word";
|
private static final String WORD_ATTR = "word";
|
||||||
|
private static final String NOT_A_WORD_ATTR = "not_a_word";
|
||||||
|
|
||||||
private static final int SHORTCUT_ONLY_DEFAULT_FREQ = 1;
|
private static final int SHORTCUT_ONLY_DEFAULT_FREQ = 1;
|
||||||
|
|
||||||
|
@ -92,7 +93,7 @@ public class XmlDictInputOutput {
|
||||||
final FusionDictionary dict = mDictionary;
|
final FusionDictionary dict = mDictionary;
|
||||||
for (final String shortcutOnly : mShortcutsMap.keySet()) {
|
for (final String shortcutOnly : mShortcutsMap.keySet()) {
|
||||||
if (dict.hasWord(shortcutOnly)) continue;
|
if (dict.hasWord(shortcutOnly)) continue;
|
||||||
dict.add(shortcutOnly, 0, mShortcutsMap.get(shortcutOnly));
|
dict.add(shortcutOnly, 0, mShortcutsMap.get(shortcutOnly), true /* isNotAWord */);
|
||||||
}
|
}
|
||||||
mDictionary = null;
|
mDictionary = null;
|
||||||
mShortcutsMap.clear();
|
mShortcutsMap.clear();
|
||||||
|
@ -144,7 +145,7 @@ public class XmlDictInputOutput {
|
||||||
@Override
|
@Override
|
||||||
public void endElement(String uri, String localName, String qName) {
|
public void endElement(String uri, String localName, String qName) {
|
||||||
if (WORD == mState) {
|
if (WORD == mState) {
|
||||||
mDictionary.add(mWord, mFreq, mShortcutsMap.get(mWord));
|
mDictionary.add(mWord, mFreq, mShortcutsMap.get(mWord), false /* isNotAWord */);
|
||||||
mState = START;
|
mState = START;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -345,7 +346,8 @@ public class XmlDictInputOutput {
|
||||||
destination.write("<!-- Warning: there is no code to read this format yet. -->\n");
|
destination.write("<!-- Warning: there is no code to read this format yet. -->\n");
|
||||||
for (Word word : set) {
|
for (Word word : set) {
|
||||||
destination.write(" <" + WORD_TAG + " " + WORD_ATTR + "=\"" + word.mWord + "\" "
|
destination.write(" <" + WORD_TAG + " " + WORD_ATTR + "=\"" + word.mWord + "\" "
|
||||||
+ FREQUENCY_ATTR + "=\"" + word.mFrequency + "\">");
|
+ FREQUENCY_ATTR + "=\"" + word.mFrequency
|
||||||
|
+ (word.mIsNotAWord ? "\" " + NOT_A_WORD_ATTR + "=\"true" : "") + "\">");
|
||||||
if (null != word.mShortcutTargets) {
|
if (null != word.mShortcutTargets) {
|
||||||
destination.write("\n");
|
destination.write("\n");
|
||||||
for (WeightedString target : word.mShortcutTargets) {
|
for (WeightedString target : word.mShortcutTargets) {
|
||||||
|
|
|
@ -43,11 +43,11 @@ public class BinaryDictInputOutputTest extends TestCase {
|
||||||
final FusionDictionary dict = new FusionDictionary(new Node(),
|
final FusionDictionary dict = new FusionDictionary(new Node(),
|
||||||
new DictionaryOptions(new HashMap<String, String>(),
|
new DictionaryOptions(new HashMap<String, String>(),
|
||||||
false /* germanUmlautProcessing */, false /* frenchLigatureProcessing */));
|
false /* germanUmlautProcessing */, false /* frenchLigatureProcessing */));
|
||||||
dict.add("foo", 1, null);
|
dict.add("foo", 1, null, false /* isNotAWord */);
|
||||||
dict.add("fta", 1, null);
|
dict.add("fta", 1, null, false /* isNotAWord */);
|
||||||
dict.add("ftb", 1, null);
|
dict.add("ftb", 1, null, false /* isNotAWord */);
|
||||||
dict.add("bar", 1, null);
|
dict.add("bar", 1, null, false /* isNotAWord */);
|
||||||
dict.add("fool", 1, null);
|
dict.add("fool", 1, null, false /* isNotAWord */);
|
||||||
final ArrayList<Node> result = BinaryDictInputOutput.flattenTree(dict.mRoot);
|
final ArrayList<Node> result = BinaryDictInputOutput.flattenTree(dict.mRoot);
|
||||||
assertEquals(4, result.size());
|
assertEquals(4, result.size());
|
||||||
while (!result.isEmpty()) {
|
while (!result.isEmpty()) {
|
||||||
|
|
Loading…
Reference in New Issue