diff --git a/java/src/com/android/inputmethod/latin/ExpandableBinaryDictionary.java b/java/src/com/android/inputmethod/latin/ExpandableBinaryDictionary.java index cdf5247de..8a509be48 100644 --- a/java/src/com/android/inputmethod/latin/ExpandableBinaryDictionary.java +++ b/java/src/com/android/inputmethod/latin/ExpandableBinaryDictionary.java @@ -172,12 +172,12 @@ abstract public class ExpandableBinaryDictionary extends Dictionary { // considering performance regression. protected void addWord(final String word, final String shortcutTarget, final int frequency) { if (shortcutTarget == null) { - mFusionDictionary.add(word, frequency, null); + mFusionDictionary.add(word, frequency, null, false /* isNotAWord */); } else { // TODO: Do this in the subclass, with this class taking an arraylist. final ArrayList shortcutTargets = CollectionUtils.newArrayList(); shortcutTargets.add(new WeightedString(shortcutTarget, frequency)); - mFusionDictionary.add(word, frequency, shortcutTargets); + mFusionDictionary.add(word, frequency, shortcutTargets, false /* isNotAWord */); } } diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java index d4f7cab5c..0c6b9c319 100644 --- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java +++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java @@ -55,6 +55,8 @@ public class BinaryDictInputOutput { * s | has a terminal ? 1 bit, 1 = yes, 0 = no : FLAG_IS_TERMINAL * | has shortcut targets ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_SHORTCUT_TARGETS * | has bigrams ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_BIGRAMS + * | is not a word ? 1 bit, 1 = yes, 0 = no : FLAG_IS_NOT_A_WORD + * | is blacklisted ? 1 bit, 1 = yes, 0 = no : FLAG_IS_BLACKLISTED * * c | IF FLAG_HAS_MULTIPLE_CHARS * h | char, char, char, char n * (1 or 3 bytes) : use CharGroupInfo for i/o helpers @@ -153,6 +155,8 @@ public class BinaryDictInputOutput { private static final int FLAG_IS_TERMINAL = 0x10; private static final int FLAG_HAS_SHORTCUT_TARGETS = 0x08; private static final int FLAG_HAS_BIGRAMS = 0x04; + private static final int FLAG_IS_NOT_A_WORD = 0x02; + private static final int FLAG_IS_BLACKLISTED = 0x01; private static final int FLAG_ATTRIBUTE_HAS_NEXT = 0x80; private static final int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40; @@ -778,6 +782,12 @@ public class BinaryDictInputOutput { } flags |= FLAG_HAS_BIGRAMS; } + if (group.mIsNotAWord) { + flags |= FLAG_IS_NOT_A_WORD; + } + if (group.mIsBlacklistEntry) { + flags |= FLAG_IS_BLACKLISTED; + } return flags; } @@ -1352,12 +1362,14 @@ public class BinaryDictInputOutput { buffer.position(currentPosition); } nodeContents.add( - new CharGroup(info.mCharacters, shortcutTargets, - bigrams, info.mFrequency, children)); + new CharGroup(info.mCharacters, shortcutTargets, bigrams, info.mFrequency, + 0 != (info.mFlags & FLAG_IS_NOT_A_WORD), + 0 != (info.mFlags & FLAG_IS_BLACKLISTED), children)); } else { nodeContents.add( - new CharGroup(info.mCharacters, shortcutTargets, - bigrams, info.mFrequency)); + new CharGroup(info.mCharacters, shortcutTargets, bigrams, info.mFrequency, + 0 != (info.mFlags & FLAG_IS_NOT_A_WORD), + 0 != (info.mFlags & FLAG_IS_BLACKLISTED))); } groupOffset = info.mEndAddress; } @@ -1478,7 +1490,11 @@ public class BinaryDictInputOutput { 0 != (optionsFlags & FRENCH_LIGATURE_PROCESSING_FLAG))); if (null != dict) { for (final Word w : dict) { - newDict.add(w.mWord, w.mFrequency, w.mShortcutTargets); + if (w.mIsBlacklistEntry) { + newDict.addBlacklistEntry(w.mWord, w.mShortcutTargets, w.mIsNotAWord); + } else { + newDict.add(w.mWord, w.mFrequency, w.mShortcutTargets, w.mIsNotAWord); + } } for (final Word w : dict) { // By construction a binary dictionary may not have bigrams pointing to diff --git a/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java b/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java index 7c15ba54d..f1abea9ec 100644 --- a/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java +++ b/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java @@ -101,26 +101,34 @@ public class FusionDictionary implements Iterable { ArrayList mBigrams; int mFrequency; // NOT_A_TERMINAL == mFrequency indicates this is not a terminal. Node mChildren; + boolean mIsNotAWord; // Only a shortcut + boolean mIsBlacklistEntry; // The two following members to help with binary generation int mCachedSize; int mCachedAddress; public CharGroup(final int[] chars, final ArrayList shortcutTargets, - final ArrayList bigrams, final int frequency) { + final ArrayList bigrams, final int frequency, + final boolean isNotAWord, final boolean isBlacklistEntry) { mChars = chars; mFrequency = frequency; mShortcutTargets = shortcutTargets; mBigrams = bigrams; mChildren = null; + mIsNotAWord = isNotAWord; + mIsBlacklistEntry = isBlacklistEntry; } public CharGroup(final int[] chars, final ArrayList shortcutTargets, - final ArrayList bigrams, final int frequency, final Node children) { + final ArrayList bigrams, final int frequency, + final boolean isNotAWord, final boolean isBlacklistEntry, final Node children) { mChars = chars; mFrequency = frequency; mShortcutTargets = shortcutTargets; mBigrams = bigrams; mChildren = children; + mIsNotAWord = isNotAWord; + mIsBlacklistEntry = isBlacklistEntry; } public void addChild(CharGroup n) { @@ -197,8 +205,9 @@ public class FusionDictionary implements Iterable { * the existing ones if any. Note: unigram, bigram, and shortcut frequencies are only * updated if they are higher than the existing ones. */ - public void update(int frequency, ArrayList shortcutTargets, - ArrayList bigrams) { + public void update(final int frequency, final ArrayList shortcutTargets, + final ArrayList bigrams, + final boolean isNotAWord, final boolean isBlacklistEntry) { if (frequency > mFrequency) { mFrequency = frequency; } @@ -234,6 +243,8 @@ public class FusionDictionary implements Iterable { } } } + mIsNotAWord = isNotAWord; + mIsBlacklistEntry = isBlacklistEntry; } } @@ -296,10 +307,24 @@ public class FusionDictionary implements Iterable { * @param word the word to add. * @param frequency the frequency of the word, in the range [0..255]. * @param shortcutTargets a list of shortcut targets for this word, or null. + * @param isNotAWord true if this should not be considered a word (e.g. shortcut only) */ public void add(final String word, final int frequency, - final ArrayList shortcutTargets) { - add(getCodePoints(word), frequency, shortcutTargets); + final ArrayList shortcutTargets, final boolean isNotAWord) { + add(getCodePoints(word), frequency, shortcutTargets, isNotAWord, + false /* isBlacklistEntry */); + } + + /** + * Helper method to add a blacklist entry as a string. + * + * @param word the word to add as a blacklist entry. + * @param shortcutTargets a list of shortcut targets for this word, or null. + * @param isNotAWord true if this is not a word for spellcheking purposes (shortcut only or so) + */ + public void addBlacklistEntry(final String word, + final ArrayList shortcutTargets, final boolean isNotAWord) { + add(getCodePoints(word), 0, shortcutTargets, isNotAWord, true /* isBlacklistEntry */); } /** @@ -332,7 +357,8 @@ public class FusionDictionary implements Iterable { if (charGroup != null) { final CharGroup charGroup2 = findWordInTree(mRoot, word2); if (charGroup2 == null) { - add(getCodePoints(word2), 0, null); + add(getCodePoints(word2), 0, null, false /* isNotAWord */, + false /* isBlacklistEntry */); } charGroup.addBigram(word2, frequency); } else { @@ -349,9 +375,12 @@ public class FusionDictionary implements Iterable { * @param word the word, as an int array. * @param frequency the frequency of the word, in the range [0..255]. * @param shortcutTargets an optional list of shortcut targets for this word (null if none). + * @param isNotAWord true if this is not a word for spellcheking purposes (shortcut only or so) + * @param isBlacklistEntry true if this is a blacklisted word, false otherwise */ private void add(final int[] word, final int frequency, - final ArrayList shortcutTargets) { + final ArrayList shortcutTargets, + final boolean isNotAWord, final boolean isBlacklistEntry) { assert(frequency >= 0 && frequency <= 255); Node currentNode = mRoot; int charIndex = 0; @@ -376,7 +405,7 @@ public class FusionDictionary implements Iterable { final int insertionIndex = findInsertionIndex(currentNode, word[charIndex]); final CharGroup newGroup = new CharGroup( Arrays.copyOfRange(word, charIndex, word.length), - shortcutTargets, null /* bigrams */, frequency); + shortcutTargets, null /* bigrams */, frequency, isNotAWord, isBlacklistEntry); currentNode.mData.add(insertionIndex, newGroup); if (DBG) checkStack(currentNode); } else { @@ -386,13 +415,15 @@ public class FusionDictionary implements Iterable { // The new word is a prefix of an existing word, but the node on which it // should end already exists as is. Since the old CharNode was not a terminal, // make it one by filling in its frequency and other attributes - currentGroup.update(frequency, shortcutTargets, null); + currentGroup.update(frequency, shortcutTargets, null, isNotAWord, + isBlacklistEntry); } else { // The new word matches the full old word and extends past it. // We only have to create a new node and add it to the end of this. final CharGroup newNode = new CharGroup( Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length), - shortcutTargets, null /* bigrams */, frequency); + shortcutTargets, null /* bigrams */, frequency, isNotAWord, + isBlacklistEntry); currentGroup.mChildren = new Node(); currentGroup.mChildren.mData.add(newNode); } @@ -400,7 +431,9 @@ public class FusionDictionary implements Iterable { if (0 == differentCharIndex) { // Exact same word. Update the frequency if higher. This will also add the // new shortcuts to the existing shortcut list if it already exists. - currentGroup.update(frequency, shortcutTargets, null); + currentGroup.update(frequency, shortcutTargets, null, + currentGroup.mIsNotAWord && isNotAWord, + currentGroup.mIsBlacklistEntry || isBlacklistEntry); } else { // Partial prefix match only. We have to replace the current node with a node // containing the current prefix and create two new ones for the tails. @@ -408,21 +441,26 @@ public class FusionDictionary implements Iterable { final CharGroup newOldWord = new CharGroup( Arrays.copyOfRange(currentGroup.mChars, differentCharIndex, currentGroup.mChars.length), currentGroup.mShortcutTargets, - currentGroup.mBigrams, currentGroup.mFrequency, currentGroup.mChildren); + currentGroup.mBigrams, currentGroup.mFrequency, + currentGroup.mIsNotAWord, currentGroup.mIsBlacklistEntry, + currentGroup.mChildren); newChildren.mData.add(newOldWord); final CharGroup newParent; if (charIndex + differentCharIndex >= word.length) { newParent = new CharGroup( Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex), - shortcutTargets, null /* bigrams */, frequency, newChildren); + shortcutTargets, null /* bigrams */, frequency, + isNotAWord, isBlacklistEntry, newChildren); } else { newParent = new CharGroup( Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex), - null /* shortcutTargets */, null /* bigrams */, -1, newChildren); + null /* shortcutTargets */, null /* bigrams */, -1, + false /* isNotAWord */, false /* isBlacklistEntry */, newChildren); final CharGroup newWord = new CharGroup(Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length), - shortcutTargets, null /* bigrams */, frequency); + shortcutTargets, null /* bigrams */, frequency, + isNotAWord, isBlacklistEntry); final int addIndex = word[charIndex + differentCharIndex] > currentGroup.mChars[differentCharIndex] ? 1 : 0; newChildren.mData.add(addIndex, newWord); @@ -483,7 +521,8 @@ public class FusionDictionary implements Iterable { private static int findInsertionIndex(final Node node, int character) { final ArrayList data = node.mData; final CharGroup reference = new CharGroup(new int[] { character }, - null /* shortcutTargets */, null /* bigrams */, 0); + null /* shortcutTargets */, null /* bigrams */, 0, false /* isNotAWord */, + false /* isBlacklistEntry */); int result = Collections.binarySearch(data, reference, CHARGROUP_COMPARATOR); return result >= 0 ? result : -result - 1; } @@ -748,7 +787,8 @@ public class FusionDictionary implements Iterable { } if (currentGroup.mFrequency >= 0) return new Word(mCurrentString.toString(), currentGroup.mFrequency, - currentGroup.mShortcutTargets, currentGroup.mBigrams); + currentGroup.mShortcutTargets, currentGroup.mBigrams, + currentGroup.mIsNotAWord, currentGroup.mIsBlacklistEntry); } else { mPositions.removeLast(); currentPos = mPositions.getLast(); diff --git a/java/src/com/android/inputmethod/latin/makedict/Word.java b/java/src/com/android/inputmethod/latin/makedict/Word.java index 65fc72c40..4683ef154 100644 --- a/java/src/com/android/inputmethod/latin/makedict/Word.java +++ b/java/src/com/android/inputmethod/latin/makedict/Word.java @@ -31,16 +31,21 @@ public class Word implements Comparable { public final int mFrequency; public final ArrayList mShortcutTargets; public final ArrayList mBigrams; + public final boolean mIsNotAWord; + public final boolean mIsBlacklistEntry; private int mHashCode = 0; public Word(final String word, final int frequency, final ArrayList shortcutTargets, - final ArrayList bigrams) { + final ArrayList bigrams, + final boolean isNotAWord, final boolean isBlacklistEntry) { mWord = word; mFrequency = frequency; mShortcutTargets = shortcutTargets; mBigrams = bigrams; + mIsNotAWord = isNotAWord; + mIsBlacklistEntry = isBlacklistEntry; } private static int computeHashCode(Word word) { @@ -48,7 +53,9 @@ public class Word implements Comparable { word.mWord, word.mFrequency, word.mShortcutTargets.hashCode(), - word.mBigrams.hashCode() + word.mBigrams.hashCode(), + word.mIsNotAWord, + word.mIsBlacklistEntry }); } @@ -78,7 +85,9 @@ public class Word implements Comparable { Word w = (Word)o; return mFrequency == w.mFrequency && mWord.equals(w.mWord) && mShortcutTargets.equals(w.mShortcutTargets) - && mBigrams.equals(w.mBigrams); + && mBigrams.equals(w.mBigrams) + && mIsNotAWord == w.mIsNotAWord + && mIsBlacklistEntry == w.mIsBlacklistEntry; } @Override diff --git a/native/jni/src/binary_format.h b/native/jni/src/binary_format.h index d8f3e83dd..25d504bfb 100644 --- a/native/jni/src/binary_format.h +++ b/native/jni/src/binary_format.h @@ -43,6 +43,10 @@ class BinaryFormat { static const int FLAG_HAS_SHORTCUT_TARGETS = 0x08; // Flag for bigram presence static const int FLAG_HAS_BIGRAMS = 0x04; + // Flag for non-words (typically, shortcut only entries) + static const int FLAG_IS_NOT_A_WORD = 0x02; + // Flag for blacklist + static const int FLAG_IS_BLACKLISTED = 0x01; // Attribute (bigram/shortcut) related flags: // Flag for presence of more attributes diff --git a/native/jni/src/terminal_attributes.h b/native/jni/src/terminal_attributes.h index 34ab8f0ef..9ff2772b1 100644 --- a/native/jni/src/terminal_attributes.h +++ b/native/jni/src/terminal_attributes.h @@ -72,6 +72,10 @@ class TerminalAttributes { return ShortcutIterator(mDict, mStartPos + BinaryFormat::SHORTCUT_LIST_SIZE_SIZE, mFlags); } + bool isBlacklistedOrNotAWord() const { + return mFlags & (BinaryFormat::FLAG_IS_BLACKLISTED | BinaryFormat::FLAG_IS_NOT_A_WORD); + } + private: DISALLOW_IMPLICIT_CONSTRUCTORS(TerminalAttributes); const uint8_t *const mDict; diff --git a/native/jni/src/unigram_dictionary.cpp b/native/jni/src/unigram_dictionary.cpp index ba3c2db6b..d4c51df63 100644 --- a/native/jni/src/unigram_dictionary.cpp +++ b/native/jni/src/unigram_dictionary.cpp @@ -391,9 +391,11 @@ inline void UnigramDictionary::onTerminal(const int probability, const int finalProbability = correction->getFinalProbability(probability, &wordPointer, &wordLength); - if (0 != finalProbability) { + if (0 != finalProbability && !terminalAttributes.isBlacklistedOrNotAWord()) { // If the probability is 0, we don't want to add this word. However we still // want to add its shortcuts (including a possible whitelist entry) if any. + // Furthermore, if this is not a word (shortcut only for example) or a blacklisted + // entry then we never want to suggest this. addWord(wordPointer, wordLength, finalProbability, masterQueue, Dictionary::KIND_CORRECTION); } @@ -841,6 +843,12 @@ int UnigramDictionary::getFrequency(const int32_t *const inWord, const int lengt return NOT_A_PROBABILITY; } const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); + if (flags & (BinaryFormat::FLAG_IS_BLACKLISTED | BinaryFormat::FLAG_IS_NOT_A_WORD)) { + // If this is not a word, or if it's a blacklisted entry, it should behave as + // having no frequency outside of the suggestion process (where it should be used + // for shortcuts). + return NOT_A_PROBABILITY; + } const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags)); if (hasMultipleChars) { pos = BinaryFormat::skipOtherCharacters(root, pos); diff --git a/tests/src/com/android/inputmethod/latin/BinaryDictIOTests.java b/tests/src/com/android/inputmethod/latin/BinaryDictIOTests.java index 0094db8a7..6c8e1ca4d 100644 --- a/tests/src/com/android/inputmethod/latin/BinaryDictIOTests.java +++ b/tests/src/com/android/inputmethod/latin/BinaryDictIOTests.java @@ -80,7 +80,7 @@ public class BinaryDictIOTests extends AndroidTestCase { final List words) { for (int i = 0; i < number; ++i) { final String word = words.get(i); - dict.add(word, UNIGRAM_FREQ, null); + dict.add(word, UNIGRAM_FREQ, null, false /* isNotAWord */); } } diff --git a/tests/src/com/android/inputmethod/latin/FusionDictionaryTests.java b/tests/src/com/android/inputmethod/latin/FusionDictionaryTests.java index 8ecdcc366..123959b4d 100644 --- a/tests/src/com/android/inputmethod/latin/FusionDictionaryTests.java +++ b/tests/src/com/android/inputmethod/latin/FusionDictionaryTests.java @@ -31,16 +31,16 @@ public class FusionDictionaryTests extends AndroidTestCase { FusionDictionary dict = new FusionDictionary(new Node(), new FusionDictionary.DictionaryOptions(new HashMap(), false, false)); - dict.add("abc", 10, null); + dict.add("abc", 10, null, false /* isNotAWord */); assertNull(FusionDictionary.findWordInTree(dict.mRoot, "aaa")); assertNotNull(FusionDictionary.findWordInTree(dict.mRoot, "abc")); - dict.add("aa", 10, null); + dict.add("aa", 10, null, false /* isNotAWord */); assertNull(FusionDictionary.findWordInTree(dict.mRoot, "aaa")); assertNotNull(FusionDictionary.findWordInTree(dict.mRoot, "aa")); - dict.add("babcd", 10, null); - dict.add("bacde", 10, null); + dict.add("babcd", 10, null, false /* isNotAWord */); + dict.add("bacde", 10, null, false /* isNotAWord */); assertNull(FusionDictionary.findWordInTree(dict.mRoot, "ba")); assertNotNull(FusionDictionary.findWordInTree(dict.mRoot, "babcd")); assertNotNull(FusionDictionary.findWordInTree(dict.mRoot, "bacde")); diff --git a/tools/dicttool/src/android/inputmethod/latin/dicttool/XmlDictInputOutput.java b/tools/dicttool/src/android/inputmethod/latin/dicttool/XmlDictInputOutput.java index 9ce8c4934..c31cd724a 100644 --- a/tools/dicttool/src/android/inputmethod/latin/dicttool/XmlDictInputOutput.java +++ b/tools/dicttool/src/android/inputmethod/latin/dicttool/XmlDictInputOutput.java @@ -50,6 +50,7 @@ public class XmlDictInputOutput { private static final String SHORTCUT_TAG = "shortcut"; private static final String FREQUENCY_ATTR = "f"; private static final String WORD_ATTR = "word"; + private static final String NOT_A_WORD_ATTR = "not_a_word"; private static final int SHORTCUT_ONLY_DEFAULT_FREQ = 1; @@ -92,7 +93,7 @@ public class XmlDictInputOutput { final FusionDictionary dict = mDictionary; for (final String shortcutOnly : mShortcutsMap.keySet()) { if (dict.hasWord(shortcutOnly)) continue; - dict.add(shortcutOnly, 0, mShortcutsMap.get(shortcutOnly)); + dict.add(shortcutOnly, 0, mShortcutsMap.get(shortcutOnly), true /* isNotAWord */); } mDictionary = null; mShortcutsMap.clear(); @@ -144,7 +145,7 @@ public class XmlDictInputOutput { @Override public void endElement(String uri, String localName, String qName) { if (WORD == mState) { - mDictionary.add(mWord, mFreq, mShortcutsMap.get(mWord)); + mDictionary.add(mWord, mFreq, mShortcutsMap.get(mWord), false /* isNotAWord */); mState = START; } } @@ -345,7 +346,8 @@ public class XmlDictInputOutput { destination.write("\n"); for (Word word : set) { destination.write(" <" + WORD_TAG + " " + WORD_ATTR + "=\"" + word.mWord + "\" " - + FREQUENCY_ATTR + "=\"" + word.mFrequency + "\">"); + + FREQUENCY_ATTR + "=\"" + word.mFrequency + + (word.mIsNotAWord ? "\" " + NOT_A_WORD_ATTR + "=\"true" : "") + "\">"); if (null != word.mShortcutTargets) { destination.write("\n"); for (WeightedString target : word.mShortcutTargets) { diff --git a/tools/dicttool/tests/com/android/inputmethod/latin/makedict/BinaryDictInputOutputTest.java b/tools/dicttool/tests/com/android/inputmethod/latin/makedict/BinaryDictInputOutputTest.java index 24042f120..88589b815 100644 --- a/tools/dicttool/tests/com/android/inputmethod/latin/makedict/BinaryDictInputOutputTest.java +++ b/tools/dicttool/tests/com/android/inputmethod/latin/makedict/BinaryDictInputOutputTest.java @@ -43,11 +43,11 @@ public class BinaryDictInputOutputTest extends TestCase { final FusionDictionary dict = new FusionDictionary(new Node(), new DictionaryOptions(new HashMap(), false /* germanUmlautProcessing */, false /* frenchLigatureProcessing */)); - dict.add("foo", 1, null); - dict.add("fta", 1, null); - dict.add("ftb", 1, null); - dict.add("bar", 1, null); - dict.add("fool", 1, null); + dict.add("foo", 1, null, false /* isNotAWord */); + dict.add("fta", 1, null, false /* isNotAWord */); + dict.add("ftb", 1, null, false /* isNotAWord */); + dict.add("bar", 1, null, false /* isNotAWord */); + dict.add("fool", 1, null, false /* isNotAWord */); final ArrayList result = BinaryDictInputOutput.flattenTree(dict.mRoot); assertEquals(4, result.size()); while (!result.isEmpty()) {