From 44c64f46a143623dd793facd889c8d6eab5e230c Mon Sep 17 00:00:00 2001 From: Jean Chalard Date: Fri, 20 Apr 2012 19:58:01 +0900 Subject: [PATCH] Ignore bigrams that are not also listed as unigrams This is a cherry pick of I14b67e51 on jb-dev Bug: 6340915 Change-Id: Iaa512abe1b19ca640ea201f9761fd7f1416270ed --- .../latin/ExpandableBinaryDictionary.java | 2 +- .../latin/makedict/BinaryDictInputOutput.java | 12 +++++- .../latin/makedict/FusionDictionary.java | 41 ++++++++----------- .../latin/makedict/XmlDictInputOutput.java | 27 +++++++----- .../latin/BinaryDictInputOutputTest.java | 10 ++--- 5 files changed, 51 insertions(+), 41 deletions(-) diff --git a/java/src/com/android/inputmethod/latin/ExpandableBinaryDictionary.java b/java/src/com/android/inputmethod/latin/ExpandableBinaryDictionary.java index 9dcffd4e2..3d89226c0 100644 --- a/java/src/com/android/inputmethod/latin/ExpandableBinaryDictionary.java +++ b/java/src/com/android/inputmethod/latin/ExpandableBinaryDictionary.java @@ -159,7 +159,7 @@ abstract public class ExpandableBinaryDictionary extends Dictionary { // TODO: Create "cache dictionary" to cache fresh words for frequently updated dictionaries, // considering performance regression. protected void addWord(final String word, final int frequency) { - mFusionDictionary.add(word, frequency, null, null); + mFusionDictionary.add(word, frequency, null /* shortcutTargets */); } /** diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java index cc98010fb..88da7b0d8 100644 --- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java +++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java @@ -1317,8 +1317,16 @@ public class BinaryDictInputOutput { 0 != (optionsFlags & GERMAN_UMLAUT_PROCESSING_FLAG), 0 != (optionsFlags & FRENCH_LIGATURE_PROCESSING_FLAG))); if (null != dict) { - for (Word w : dict) { - newDict.add(w.mWord, w.mFrequency, w.mShortcutTargets, w.mBigrams); + for (final Word w : dict) { + newDict.add(w.mWord, w.mFrequency, w.mShortcutTargets); + } + for (final Word w : dict) { + // By construction a binary dictionary may not have bigrams pointing to + // words that are not also registered as unigrams so we don't have to avoid + // them explicitly here. + for (final WeightedString bigram : w.mBigrams) { + newDict.setBigram(w.mWord, bigram.mWord, bigram.mFrequency); + } } } diff --git a/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java b/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java index 40bcfc3aa..c293b2ba4 100644 --- a/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java +++ b/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java @@ -286,7 +286,7 @@ public class FusionDictionary implements Iterable { for (WeightedString word : words) { final CharGroup t = findWordInTree(mRoot, word.mWord); if (null == t) { - add(getCodePoints(word.mWord), 0, null, null); + add(getCodePoints(word.mWord), 0, null); } } } @@ -305,12 +305,8 @@ public class FusionDictionary implements Iterable { * @param bigrams a list of bigrams, or null. */ public void add(final String word, final int frequency, - final ArrayList shortcutTargets, - final ArrayList bigrams) { - if (null != bigrams) { - addNeutralWords(bigrams); - } - add(getCodePoints(word), frequency, shortcutTargets, bigrams); + final ArrayList shortcutTargets) { + add(getCodePoints(word), frequency, shortcutTargets); } /** @@ -344,7 +340,7 @@ public class FusionDictionary implements Iterable { final CharGroup charGroup2 = findWordInTree(mRoot, word2); if (charGroup2 == null) { // TODO: refactor with the identical code in addNeutralWords - add(getCodePoints(word2), 0, null, null); + add(getCodePoints(word2), 0, null); } charGroup.addBigram(word2, frequency); } else { @@ -355,17 +351,15 @@ public class FusionDictionary implements Iterable { /** * Add a word to this dictionary. * - * The shortcuts and bigrams, if any, have to be in the dictionary already. If they aren't, + * The shortcuts, if any, have to be in the dictionary already. If they aren't, * an exception is thrown. * * @param word the word, as an int array. * @param frequency the frequency of the word, in the range [0..255]. * @param shortcutTargets an optional list of shortcut targets for this word (null if none). - * @param bigrams an optional list of bigrams for this word (null if none). */ private void add(final int[] word, final int frequency, - final ArrayList shortcutTargets, - final ArrayList bigrams) { + final ArrayList shortcutTargets) { assert(frequency >= 0 && frequency <= 255); Node currentNode = mRoot; int charIndex = 0; @@ -390,7 +384,7 @@ public class FusionDictionary implements Iterable { final int insertionIndex = findInsertionIndex(currentNode, word[charIndex]); final CharGroup newGroup = new CharGroup( Arrays.copyOfRange(word, charIndex, word.length), - shortcutTargets, bigrams, frequency); + shortcutTargets, null /* bigrams */, frequency); currentNode.mData.add(insertionIndex, newGroup); checkStack(currentNode); } else { @@ -400,21 +394,21 @@ public class FusionDictionary implements Iterable { // The new word is a prefix of an existing word, but the node on which it // should end already exists as is. Since the old CharNode was not a terminal, // make it one by filling in its frequency and other attributes - currentGroup.update(frequency, shortcutTargets, bigrams); + currentGroup.update(frequency, shortcutTargets, null); } else { // The new word matches the full old word and extends past it. // We only have to create a new node and add it to the end of this. final CharGroup newNode = new CharGroup( Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length), - shortcutTargets, bigrams, frequency); + shortcutTargets, null /* bigrams */, frequency); currentGroup.mChildren = new Node(); currentGroup.mChildren.mData.add(newNode); } } else { if (0 == differentCharIndex) { // Exact same word. Update the frequency if higher. This will also add the - // new bigrams to the existing bigram list if it already exists. - currentGroup.update(frequency, shortcutTargets, bigrams); + // new shortcuts to the existing shortcut list if it already exists. + currentGroup.update(frequency, shortcutTargets, null); } else { // Partial prefix match only. We have to replace the current node with a node // containing the current prefix and create two new ones for the tails. @@ -429,14 +423,14 @@ public class FusionDictionary implements Iterable { if (charIndex + differentCharIndex >= word.length) { newParent = new CharGroup( Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex), - shortcutTargets, bigrams, frequency, newChildren); + shortcutTargets, null /* bigrams */, frequency, newChildren); } else { newParent = new CharGroup( Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex), - null, null, -1, newChildren); - final CharGroup newWord = new CharGroup( - Arrays.copyOfRange(word, charIndex + differentCharIndex, - word.length), shortcutTargets, bigrams, frequency); + null /* shortcutTargets */, null /* bigrams */, -1, newChildren); + final CharGroup newWord = new CharGroup(Arrays.copyOfRange(word, + charIndex + differentCharIndex, word.length), + shortcutTargets, null /* bigrams */, frequency); final int addIndex = word[charIndex + differentCharIndex] > currentGroup.mChars[differentCharIndex] ? 1 : 0; newChildren.mData.add(addIndex, newWord); @@ -494,7 +488,8 @@ public class FusionDictionary implements Iterable { */ private static int findInsertionIndex(final Node node, int character) { final ArrayList data = node.mData; - final CharGroup reference = new CharGroup(new int[] { character }, null, null, 0); + final CharGroup reference = new CharGroup(new int[] { character }, + null /* shortcutTargets */, null /* bigrams */, 0); int result = Collections.binarySearch(data, reference, CHARGROUP_COMPARATOR); return result >= 0 ? result : -result - 1; } diff --git a/tools/makedict/src/com/android/inputmethod/latin/makedict/XmlDictInputOutput.java b/tools/makedict/src/com/android/inputmethod/latin/makedict/XmlDictInputOutput.java index d1d2a9ca4..d86719a1d 100644 --- a/tools/makedict/src/com/android/inputmethod/latin/makedict/XmlDictInputOutput.java +++ b/tools/makedict/src/com/android/inputmethod/latin/makedict/XmlDictInputOutput.java @@ -72,19 +72,15 @@ public class XmlDictInputOutput { int mFreq; // the currently read freq String mWord; // the current word final HashMap> mShortcutsMap; - final HashMap> mBigramsMap; /** * Create the handler. * * @param shortcuts the shortcuts as a map. This may be empty, but may not be null. - * @param bigrams the bigrams as a map. This may be empty, but may not be null. */ - public UnigramHandler(final HashMap> shortcuts, - final HashMap> bigrams) { + public UnigramHandler(final HashMap> shortcuts) { mDictionary = null; mShortcutsMap = shortcuts; - mBigramsMap = bigrams; mWord = ""; mState = START; mFreq = 0; @@ -94,7 +90,6 @@ public class XmlDictInputOutput { final FusionDictionary dict = mDictionary; mDictionary = null; mShortcutsMap.clear(); - mBigramsMap.clear(); mWord = ""; mState = START; mFreq = 0; @@ -143,7 +138,7 @@ public class XmlDictInputOutput { @Override public void endElement(String uri, String localName, String qName) { if (WORD == mState) { - mDictionary.add(mWord, mFreq, mShortcutsMap.get(mWord), mBigramsMap.get(mWord)); + mDictionary.add(mWord, mFreq, mShortcutsMap.get(mWord)); mState = START; } } @@ -191,6 +186,7 @@ public class XmlDictInputOutput { } } + // This may return an empty map, but will never return null. public HashMap> getAssocMap() { return mAssocMap; } @@ -211,6 +207,7 @@ public class XmlDictInputOutput { BIGRAM_FREQ_ATTRIBUTE); } + // As per getAssocMap(), this never returns null. public HashMap> getBigramMap() { return getAssocMap(); } @@ -231,6 +228,7 @@ public class XmlDictInputOutput { TARGET_PRIORITY_ATTRIBUTE); } + // As per getAssocMap(), this never returns null. public HashMap> getShortcutMap() { return getAssocMap(); } @@ -260,10 +258,19 @@ public class XmlDictInputOutput { if (null != shortcuts) parser.parse(shortcuts, shortcutHandler); final UnigramHandler unigramHandler = - new UnigramHandler(shortcutHandler.getShortcutMap(), - bigramHandler.getBigramMap()); + new UnigramHandler(shortcutHandler.getShortcutMap()); parser.parse(unigrams, unigramHandler); - return unigramHandler.getFinalDictionary(); + final FusionDictionary dict = unigramHandler.getFinalDictionary(); + final HashMap> bigramMap = bigramHandler.getBigramMap(); + for (final String firstWord : bigramMap.keySet()) { + if (!dict.hasWord(firstWord)) continue; + final ArrayList bigramList = bigramMap.get(firstWord); + for (final WeightedString bigram : bigramList) { + if (!dict.hasWord(bigram.mWord)) continue; + dict.setBigram(firstWord, bigram.mWord, bigram.mFrequency); + } + } + return dict; } /** diff --git a/tools/makedict/tests/com/android/inputmethod/latin/BinaryDictInputOutputTest.java b/tools/makedict/tests/com/android/inputmethod/latin/BinaryDictInputOutputTest.java index 191eb804d..24042f120 100644 --- a/tools/makedict/tests/com/android/inputmethod/latin/BinaryDictInputOutputTest.java +++ b/tools/makedict/tests/com/android/inputmethod/latin/BinaryDictInputOutputTest.java @@ -43,11 +43,11 @@ public class BinaryDictInputOutputTest extends TestCase { final FusionDictionary dict = new FusionDictionary(new Node(), new DictionaryOptions(new HashMap(), false /* germanUmlautProcessing */, false /* frenchLigatureProcessing */)); - dict.add("foo", 1, null, null); - dict.add("fta", 1, null, null); - dict.add("ftb", 1, null, null); - dict.add("bar", 1, null, null); - dict.add("fool", 1, null, null); + dict.add("foo", 1, null); + dict.add("fta", 1, null); + dict.add("ftb", 1, null); + dict.add("bar", 1, null); + dict.add("fool", 1, null); final ArrayList result = BinaryDictInputOutput.flattenTree(dict.mRoot); assertEquals(4, result.size()); while (!result.isEmpty()) {