From 7cfe20efbeb4a94b15291aee95d0559ae2449c45 Mon Sep 17 00:00:00 2001 From: Tom Ouyang Date: Wed, 21 Mar 2012 23:55:10 +0900 Subject: [PATCH] Add support for updating and adding bigrams to existing nodes. Bug: 6188977 Change-Id: I48aca8ba199247d73395ab13b9d1976f4e739208 --- .../latin/makedict/FusionDictionary.java | 157 ++++++++++++++---- 1 file changed, 128 insertions(+), 29 deletions(-) diff --git a/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java b/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java index e88ab685a..9dc294edf 100644 --- a/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java +++ b/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java @@ -59,7 +59,7 @@ public class FusionDictionary implements Iterable { */ public static class WeightedString { final String mWord; - final int mFrequency; + int mFrequency; public WeightedString(String word, int frequency) { mWord = word; mFrequency = frequency; @@ -81,10 +81,10 @@ public class FusionDictionary implements Iterable { public static class CharGroup { public static final int NOT_A_TERMINAL = -1; final int mChars[]; - final ArrayList mShortcutTargets; - final ArrayList mBigrams; - final int mFrequency; // NOT_A_TERMINAL == mFrequency indicates this is not a terminal. - final boolean mIsShortcutOnly; // Only valid if this is a terminal. + ArrayList mShortcutTargets; + ArrayList mBigrams; + int mFrequency; // NOT_A_TERMINAL == mFrequency indicates this is not a terminal. + boolean mIsShortcutOnly; // Only valid if this is a terminal. Node mChildren; // The two following members to help with binary generation int mCachedSize; @@ -133,6 +133,102 @@ public class FusionDictionary implements Iterable { assert(mChars.length > 0); return 1 < mChars.length; } + + /** + * Adds a word to the bigram list. Updates the frequency if the word already + * exists. + */ + public void addBigram(final String word, final int frequency) { + if (mBigrams == null) { + mBigrams = new ArrayList(); + } + WeightedString bigram = getBigram(word); + if (bigram != null) { + bigram.mFrequency = frequency; + } else { + bigram = new WeightedString(word, frequency); + mBigrams.add(bigram); + } + } + + /** + * Gets the shortcut target for the given word. Returns null if the word is not in the + * shortcut list. + */ + public WeightedString getShortcut(final String word) { + if (mShortcutTargets != null) { + final int size = mShortcutTargets.size(); + for (int i = 0; i < size; ++i) { + WeightedString shortcut = mShortcutTargets.get(i); + if (shortcut.mWord.equals(word)) { + return shortcut; + } + } + } + return null; + } + + /** + * Gets the bigram for the given word. + * Returns null if the word is not in the bigrams list. + */ + public WeightedString getBigram(final String word) { + if (mBigrams != null) { + final int size = mBigrams.size(); + for (int i = 0; i < size; ++i) { + WeightedString bigram = mBigrams.get(i); + if (bigram.mWord.equals(word)) { + return bigram; + } + } + } + return null; + } + + /** + * Updates the CharGroup with the given properties. Adds the shortcut and bigram lists to + * the existing ones if any. Note: unigram, bigram, and shortcut frequencies are only + * updated if they are higher than the existing ones. + */ + public void update(int frequency, ArrayList shortcutTargets, + ArrayList bigrams, boolean isShortcutOnly) { + if (frequency > mFrequency) { + mFrequency = frequency; + } + if (shortcutTargets != null) { + if (mShortcutTargets == null) { + mShortcutTargets = shortcutTargets; + } else { + final int size = shortcutTargets.size(); + for (int i = 0; i < size; ++i) { + final WeightedString shortcut = shortcutTargets.get(i); + final WeightedString existingShortcut = getShortcut(shortcut.mWord); + if (existingShortcut == null) { + mShortcutTargets.add(shortcut); + } else if (existingShortcut.mFrequency < shortcut.mFrequency) { + existingShortcut.mFrequency = shortcut.mFrequency; + } + } + } + } + if (bigrams != null) { + if (mBigrams == null) { + mBigrams = bigrams; + } else { + final int size = bigrams.size(); + for (int i = 0; i < size; ++i) { + final WeightedString bigram = bigrams.get(i); + final WeightedString existingBigram = getBigram(bigram.mWord); + if (existingBigram == null) { + mBigrams.add(bigram); + } else if (existingBigram.mFrequency < bigram.mFrequency) { + existingBigram.mFrequency = bigram.mFrequency; + } + } + } + } + mIsShortcutOnly = isShortcutOnly; + } } /** @@ -245,6 +341,27 @@ public class FusionDictionary implements Iterable { add(getCodePoints(word), frequency, shortcutTargets, null, true /* isShortcutOnly */); } + /** + * Helper method to add a new bigram to the dictionary. + * + * @param word1 the previous word of the context + * @param word2 the next word of the context + * @param frequency the bigram frequency + */ + public void setBigram(final String word1, final String word2, final int frequency) { + CharGroup charGroup = findWordInTree(mRoot, word1); + if (charGroup != null) { + final CharGroup charGroup2 = findWordInTree(mRoot, word2); + if (charGroup2 == null) { + // TODO: refactor with the identical code in addNeutralWords + add(getCodePoints(word2), 0, null, null, false /* isShortcutOnly */); + } + charGroup.addBigram(word2, frequency); + } else { + throw new RuntimeException("First word of bigram not found"); + } + } + /** * Add a word to this dictionary. * @@ -293,17 +410,9 @@ public class FusionDictionary implements Iterable { if (differentCharIndex == currentGroup.mChars.length) { if (charIndex + differentCharIndex >= word.length) { // The new word is a prefix of an existing word, but the node on which it - // should end already exists as is. - if (currentGroup.mFrequency > 0) { - throw new RuntimeException("Such a word already exists in the dictionary : " - + new String(word, 0, word.length)); - } else { - final CharGroup newNode = new CharGroup(currentGroup.mChars, - shortcutTargets, bigrams, frequency, currentGroup.mChildren, - isShortcutOnly); - currentNode.mData.set(nodeIndex, newNode); - checkStack(currentNode); - } + // should end already exists as is. Since the old CharNode was not a terminal, + // make it one by filling in its frequency and other attributes + currentGroup.update(frequency, shortcutTargets, bigrams, isShortcutOnly); } else { // The new word matches the full old word and extends past it. // We only have to create a new node and add it to the end of this. @@ -315,19 +424,9 @@ public class FusionDictionary implements Iterable { } } else { if (0 == differentCharIndex) { - // Exact same word. Check the frequency is 0 or NOT_A_TERMINAL, and update. - if (0 != frequency) { - if (0 < currentGroup.mFrequency) { - throw new RuntimeException("This word already exists with frequency " - + currentGroup.mFrequency + " : " - + new String(word, 0, word.length)); - } - final CharGroup newGroup = new CharGroup(word, - currentGroup.mShortcutTargets, currentGroup.mBigrams, - frequency, currentGroup.mChildren, - currentGroup.mIsShortcutOnly && isShortcutOnly); - currentNode.mData.set(nodeIndex, newGroup); - } + // Exact same word. Update the frequency if higher. This will also add the + // new bigrams to the existing bigram list if it already exists. + currentGroup.update(frequency, shortcutTargets, bigrams, isShortcutOnly); } else { // Partial prefix match only. We have to replace the current node with a node // containing the current prefix and create two new ones for the tails.