Merge "Ignore bigrams that are not also listed as unigrams" into jb-dev

2012-04-26 00:26:04 -07:00 · 2012-04-26 00:26:04 -07:00 · 329c8d7bcc
commit 329c8d7bcc
parent 604599c389 44c64f46a1
5 changed files with 51 additions and 41 deletions
--- a/java/src/com/android/inputmethod/latin/ExpandableBinaryDictionary.java
+++ b/java/src/com/android/inputmethod/latin/ExpandableBinaryDictionary.java
@@ -159,7 +159,7 @@ abstract public class ExpandableBinaryDictionary extends Dictionary {
    // TODO: Create "cache dictionary" to cache fresh words for frequently updated dictionaries,
    // considering performance regression.
    protected void addWord(final String word, final int frequency) {
-        mFusionDictionary.add(word, frequency, null, null);
+        mFusionDictionary.add(word, frequency, null /* shortcutTargets */);
    }

    /**
--- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java
+++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java
@@ -1317,8 +1317,16 @@ public class BinaryDictInputOutput {
                        0 != (optionsFlags & GERMAN_UMLAUT_PROCESSING_FLAG),
                        0 != (optionsFlags & FRENCH_LIGATURE_PROCESSING_FLAG)));
        if (null != dict) {
-            for (Word w : dict) {
-                newDict.add(w.mWord, w.mFrequency, w.mShortcutTargets, w.mBigrams);
+            for (final Word w : dict) {
+                newDict.add(w.mWord, w.mFrequency, w.mShortcutTargets);
+            }
+            for (final Word w : dict) {
+                // By construction a binary dictionary may not have bigrams pointing to
+                // words that are not also registered as unigrams so we don't have to avoid
+                // them explicitly here.
+                for (final WeightedString bigram : w.mBigrams) {
+                    newDict.setBigram(w.mWord, bigram.mWord, bigram.mFrequency);
+                }
            }
        }

--- a/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java
+++ b/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java
@@ -286,7 +286,7 @@ public class FusionDictionary implements Iterable<Word> {
            for (WeightedString word : words) {
                final CharGroup t = findWordInTree(mRoot, word.mWord);
                if (null == t) {
-                    add(getCodePoints(word.mWord), 0, null, null);
+                    add(getCodePoints(word.mWord), 0, null);
                }
            }
        }
@@ -305,12 +305,8 @@ public class FusionDictionary implements Iterable<Word> {
     * @param bigrams a list of bigrams, or null.
     */
    public void add(final String word, final int frequency,
-            final ArrayList<WeightedString> shortcutTargets,
-            final ArrayList<WeightedString> bigrams) {
-        if (null != bigrams) {
-            addNeutralWords(bigrams);
-        }
-        add(getCodePoints(word), frequency, shortcutTargets, bigrams);
+            final ArrayList<WeightedString> shortcutTargets) {
+        add(getCodePoints(word), frequency, shortcutTargets);
    }

    /**
@@ -344,7 +340,7 @@ public class FusionDictionary implements Iterable<Word> {
            final CharGroup charGroup2 = findWordInTree(mRoot, word2);
            if (charGroup2 == null) {
                // TODO: refactor with the identical code in addNeutralWords
-                add(getCodePoints(word2), 0, null, null);
+                add(getCodePoints(word2), 0, null);
            }
            charGroup.addBigram(word2, frequency);
        } else {
@@ -355,17 +351,15 @@ public class FusionDictionary implements Iterable<Word> {
    /**
     * Add a word to this dictionary.
     *
-     * The shortcuts and bigrams, if any, have to be in the dictionary already. If they aren't,
+     * The shortcuts, if any, have to be in the dictionary already. If they aren't,
     * an exception is thrown.
     *
     * @param word the word, as an int array.
     * @param frequency the frequency of the word, in the range [0..255].
     * @param shortcutTargets an optional list of shortcut targets for this word (null if none).
-     * @param bigrams an optional list of bigrams for this word (null if none).
     */
    private void add(final int[] word, final int frequency,
-            final ArrayList<WeightedString> shortcutTargets,
-            final ArrayList<WeightedString> bigrams) {
+            final ArrayList<WeightedString> shortcutTargets) {
        assert(frequency >= 0 && frequency <= 255);
        Node currentNode = mRoot;
        int charIndex = 0;
@@ -390,7 +384,7 @@ public class FusionDictionary implements Iterable<Word> {
            final int insertionIndex = findInsertionIndex(currentNode, word[charIndex]);
            final CharGroup newGroup = new CharGroup(
                    Arrays.copyOfRange(word, charIndex, word.length),
-                    shortcutTargets, bigrams, frequency);
+                    shortcutTargets, null /* bigrams */, frequency);
            currentNode.mData.add(insertionIndex, newGroup);
            checkStack(currentNode);
        } else {
@@ -400,21 +394,21 @@ public class FusionDictionary implements Iterable<Word> {
                    // The new word is a prefix of an existing word, but the node on which it
                    // should end already exists as is. Since the old CharNode was not a terminal, 
                    // make it one by filling in its frequency and other attributes
-                    currentGroup.update(frequency, shortcutTargets, bigrams);
+                    currentGroup.update(frequency, shortcutTargets, null);
                } else {
                    // The new word matches the full old word and extends past it.
                    // We only have to create a new node and add it to the end of this.
                    final CharGroup newNode = new CharGroup(
                            Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length),
-                                    shortcutTargets, bigrams, frequency);
+                                    shortcutTargets, null /* bigrams */, frequency);
                    currentGroup.mChildren = new Node();
                    currentGroup.mChildren.mData.add(newNode);
                }
            } else {
                if (0 == differentCharIndex) {
                    // Exact same word. Update the frequency if higher. This will also add the
-                    // new bigrams to the existing bigram list if it already exists.
-                    currentGroup.update(frequency, shortcutTargets, bigrams);
+                    // new shortcuts to the existing shortcut list if it already exists.
+                    currentGroup.update(frequency, shortcutTargets, null);
                } else {
                    // Partial prefix match only. We have to replace the current node with a node
                    // containing the current prefix and create two new ones for the tails.
@@ -429,14 +423,14 @@ public class FusionDictionary implements Iterable<Word> {
                    if (charIndex + differentCharIndex >= word.length) {
                        newParent = new CharGroup(
                                Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex),
-                                shortcutTargets, bigrams, frequency, newChildren);
+                                shortcutTargets, null /* bigrams */, frequency, newChildren);
                    } else {
                        newParent = new CharGroup(
                                Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex),
-                                null, null, -1, newChildren);
-                        final CharGroup newWord = new CharGroup(
-                                Arrays.copyOfRange(word, charIndex + differentCharIndex,
-                                        word.length), shortcutTargets, bigrams, frequency);
+                                null /* shortcutTargets */, null /* bigrams */, -1, newChildren);
+                        final CharGroup newWord = new CharGroup(Arrays.copyOfRange(word,
+                                charIndex + differentCharIndex, word.length),
+                                shortcutTargets, null /* bigrams */, frequency);
                        final int addIndex = word[charIndex + differentCharIndex]
                                > currentGroup.mChars[differentCharIndex] ? 1 : 0;
                        newChildren.mData.add(addIndex, newWord);
@@ -494,7 +488,8 @@ public class FusionDictionary implements Iterable<Word> {
     */
    private static int findInsertionIndex(final Node node, int character) {
        final ArrayList<CharGroup> data = node.mData;
-        final CharGroup reference = new CharGroup(new int[] { character }, null, null, 0);
+        final CharGroup reference = new CharGroup(new int[] { character },
+                null /* shortcutTargets */, null /* bigrams */, 0);
        int result = Collections.binarySearch(data, reference, CHARGROUP_COMPARATOR);
        return result >= 0 ? result : -result - 1;
    }
--- a/tools/makedict/src/com/android/inputmethod/latin/makedict/XmlDictInputOutput.java
+++ b/tools/makedict/src/com/android/inputmethod/latin/makedict/XmlDictInputOutput.java
@@ -72,19 +72,15 @@ public class XmlDictInputOutput {
        int mFreq; // the currently read freq
        String mWord; // the current word
        final HashMap<String, ArrayList<WeightedString>> mShortcutsMap;
-        final HashMap<String, ArrayList<WeightedString>> mBigramsMap;

        /**
         * Create the handler.
         *
         * @param shortcuts the shortcuts as a map. This may be empty, but may not be null.
-         * @param bigrams the bigrams as a map. This may be empty, but may not be null.
         */
-        public UnigramHandler(final HashMap<String, ArrayList<WeightedString>> shortcuts,
-                final HashMap<String, ArrayList<WeightedString>> bigrams) {
+        public UnigramHandler(final HashMap<String, ArrayList<WeightedString>> shortcuts) {
            mDictionary = null;
            mShortcutsMap = shortcuts;
-            mBigramsMap = bigrams;
            mWord = "";
            mState = START;
            mFreq = 0;
@@ -94,7 +90,6 @@ public class XmlDictInputOutput {
            final FusionDictionary dict = mDictionary;
            mDictionary = null;
            mShortcutsMap.clear();
-            mBigramsMap.clear();
            mWord = "";
            mState = START;
            mFreq = 0;
@@ -143,7 +138,7 @@ public class XmlDictInputOutput {
        @Override
        public void endElement(String uri, String localName, String qName) {
            if (WORD == mState) {
-                mDictionary.add(mWord, mFreq, mShortcutsMap.get(mWord), mBigramsMap.get(mWord));
+                mDictionary.add(mWord, mFreq, mShortcutsMap.get(mWord));
                mState = START;
            }
        }
@@ -191,6 +186,7 @@ public class XmlDictInputOutput {
            }
        }

+        // This may return an empty map, but will never return null.
        public HashMap<String, ArrayList<WeightedString>> getAssocMap() {
            return mAssocMap;
        }
@@ -211,6 +207,7 @@ public class XmlDictInputOutput {
                    BIGRAM_FREQ_ATTRIBUTE);
        }

+        // As per getAssocMap(), this never returns null.
        public HashMap<String, ArrayList<WeightedString>> getBigramMap() {
            return getAssocMap();
        }
@@ -231,6 +228,7 @@ public class XmlDictInputOutput {
                    TARGET_PRIORITY_ATTRIBUTE);
        }

+        // As per getAssocMap(), this never returns null.
        public HashMap<String, ArrayList<WeightedString>> getShortcutMap() {
            return getAssocMap();
        }
@@ -260,10 +258,19 @@ public class XmlDictInputOutput {
        if (null != shortcuts) parser.parse(shortcuts, shortcutHandler);

        final UnigramHandler unigramHandler =
-                new UnigramHandler(shortcutHandler.getShortcutMap(),
-                        bigramHandler.getBigramMap());
+                new UnigramHandler(shortcutHandler.getShortcutMap());
        parser.parse(unigrams, unigramHandler);
-        return unigramHandler.getFinalDictionary();
+        final FusionDictionary dict = unigramHandler.getFinalDictionary();
+        final HashMap<String, ArrayList<WeightedString>> bigramMap = bigramHandler.getBigramMap();
+        for (final String firstWord : bigramMap.keySet()) {
+            if (!dict.hasWord(firstWord)) continue;
+            final ArrayList<WeightedString> bigramList = bigramMap.get(firstWord);
+            for (final WeightedString bigram : bigramList) {
+                if (!dict.hasWord(bigram.mWord)) continue;
+                dict.setBigram(firstWord, bigram.mWord, bigram.mFrequency);
+            }
+        }
+        return dict;
    }

    /**
--- a/tools/makedict/tests/com/android/inputmethod/latin/BinaryDictInputOutputTest.java
+++ b/tools/makedict/tests/com/android/inputmethod/latin/BinaryDictInputOutputTest.java
@@ -43,11 +43,11 @@ public class BinaryDictInputOutputTest extends TestCase {
        final FusionDictionary dict = new FusionDictionary(new Node(),
                new DictionaryOptions(new HashMap<String, String>(),
                        false /* germanUmlautProcessing */, false /* frenchLigatureProcessing */));
-        dict.add("foo", 1, null, null);
-        dict.add("fta", 1, null, null);
-        dict.add("ftb", 1, null, null);
-        dict.add("bar", 1, null, null);
-        dict.add("fool", 1, null, null);
+        dict.add("foo", 1, null);
+        dict.add("fta", 1, null);
+        dict.add("ftb", 1, null);
+        dict.add("bar", 1, null);
+        dict.add("fool", 1, null);
        final ArrayList<Node> result = BinaryDictInputOutput.flattenTree(dict.mRoot);
        assertEquals(4, result.size());
        while (!result.isEmpty()) {