Add support for updating and adding bigrams to existing nodes.

Bug: 6188977
Change-Id: I48aca8ba199247d73395ab13b9d1976f4e739208
This commit is contained in:
Tom Ouyang 2012-03-21 23:55:10 +09:00
parent 19b8a73efa
commit 7cfe20efbe

View file

@ -59,7 +59,7 @@ public class FusionDictionary implements Iterable<Word> {
*/ */
public static class WeightedString { public static class WeightedString {
final String mWord; final String mWord;
final int mFrequency; int mFrequency;
public WeightedString(String word, int frequency) { public WeightedString(String word, int frequency) {
mWord = word; mWord = word;
mFrequency = frequency; mFrequency = frequency;
@ -81,10 +81,10 @@ public class FusionDictionary implements Iterable<Word> {
public static class CharGroup { public static class CharGroup {
public static final int NOT_A_TERMINAL = -1; public static final int NOT_A_TERMINAL = -1;
final int mChars[]; final int mChars[];
final ArrayList<WeightedString> mShortcutTargets; ArrayList<WeightedString> mShortcutTargets;
final ArrayList<WeightedString> mBigrams; ArrayList<WeightedString> mBigrams;
final int mFrequency; // NOT_A_TERMINAL == mFrequency indicates this is not a terminal. int mFrequency; // NOT_A_TERMINAL == mFrequency indicates this is not a terminal.
final boolean mIsShortcutOnly; // Only valid if this is a terminal. boolean mIsShortcutOnly; // Only valid if this is a terminal.
Node mChildren; Node mChildren;
// The two following members to help with binary generation // The two following members to help with binary generation
int mCachedSize; int mCachedSize;
@ -133,6 +133,102 @@ public class FusionDictionary implements Iterable<Word> {
assert(mChars.length > 0); assert(mChars.length > 0);
return 1 < mChars.length; return 1 < mChars.length;
} }
/**
 * Records {@code word} as a bigram of this group with the given frequency.
 *
 * If the word is already present in the bigram list, its frequency is overwritten with
 * {@code frequency} (whether higher or lower). Otherwise a new entry is appended, and
 * the bigram list is created first if this group does not have one yet.
 *
 * @param word the word to register as a bigram
 * @param frequency the frequency to store for that bigram
 */
public void addBigram(final String word, final int frequency) {
    final WeightedString known = getBigram(word);
    if (known != null) {
        // Already listed: just replace the stored frequency.
        known.mFrequency = frequency;
        return;
    }
    // Not listed yet: make sure there is a list to append to.
    if (mBigrams == null) {
        mBigrams = new ArrayList<WeightedString>();
    }
    mBigrams.add(new WeightedString(word, frequency));
}
/**
 * Looks up a shortcut target by its word.
 *
 * @param word the word to search for in the shortcut target list
 * @return the matching entry, or null if this group has no shortcut list or the list
 *         contains no entry for {@code word}
 */
public WeightedString getShortcut(final String word) {
    if (mShortcutTargets == null) {
        return null;
    }
    // Linear scan; the first entry whose word matches wins.
    for (final WeightedString candidate : mShortcutTargets) {
        if (candidate.mWord.equals(word)) {
            return candidate;
        }
    }
    return null;
}
/**
 * Looks up a bigram by its word.
 *
 * @param word the word to search for in the bigram list
 * @return the matching entry, or null if this group has no bigram list or the list
 *         contains no entry for {@code word}
 */
public WeightedString getBigram(final String word) {
    if (mBigrams == null) {
        return null;
    }
    // Linear scan; the first entry whose word matches wins.
    for (final WeightedString candidate : mBigrams) {
        if (candidate.mWord.equals(word)) {
            return candidate;
        }
    }
    return null;
}
/**
 * Merges the given properties into this CharGroup.
 *
 * The unigram frequency is raised to {@code frequency} only if that is higher than the
 * current one. Shortcut targets and bigrams are merged into the existing lists: entries
 * for unknown words are appended, and entries for known words only raise the stored
 * frequency, never lower it. If this group had no list of a given kind, the passed list
 * is adopted as is (by reference, not copied).
 *
 * The shortcut-only flag is only meaningful for terminals. If this group was not a
 * terminal before the call, the flag is simply set to {@code isShortcutOnly}. If it was
 * already a terminal, it stays shortcut-only only when both the existing flag and
 * {@code isShortcutOnly} say so; this prevents a later shortcut-only insertion from
 * hiding a word that was previously added as a regular word.
 *
 * @param frequency the candidate unigram frequency
 * @param shortcutTargets shortcut targets to merge in, or null for none
 * @param bigrams bigrams to merge in, or null for none
 * @param isShortcutOnly whether the word being merged in is a shortcut-only entry
 */
public void update(int frequency, ArrayList<WeightedString> shortcutTargets,
        ArrayList<WeightedString> bigrams, boolean isShortcutOnly) {
    // Must be captured before the frequency is touched: a non-terminal becomes a terminal
    // as soon as it receives a frequency other than NOT_A_TERMINAL.
    final boolean wasTerminal = NOT_A_TERMINAL != mFrequency;
    if (frequency > mFrequency) {
        mFrequency = frequency;
    }
    mShortcutTargets = mergeKeepingHighestFrequency(mShortcutTargets, shortcutTargets);
    mBigrams = mergeKeepingHighestFrequency(mBigrams, bigrams);
    mIsShortcutOnly = wasTerminal ? (mIsShortcutOnly && isShortcutOnly) : isShortcutOnly;
}

/**
 * Merges {@code incoming} into {@code existing}, keeping the higher frequency for words
 * present in both. Returns the list that should be stored afterwards: {@code existing}
 * (modified in place) when it is non-null, otherwise {@code incoming} itself.
 */
private static ArrayList<WeightedString> mergeKeepingHighestFrequency(
        final ArrayList<WeightedString> existing,
        final ArrayList<WeightedString> incoming) {
    if (incoming == null) {
        return existing;
    }
    if (existing == null) {
        return incoming;
    }
    final int incomingSize = incoming.size();
    for (int i = 0; i < incomingSize; ++i) {
        final WeightedString candidate = incoming.get(i);
        WeightedString match = null;
        for (final WeightedString current : existing) {
            if (current.mWord.equals(candidate.mWord)) {
                match = current;
                break;
            }
        }
        if (match == null) {
            existing.add(candidate);
        } else if (match.mFrequency < candidate.mFrequency) {
            match.mFrequency = candidate.mFrequency;
        }
    }
    return existing;
}
} }
/** /**
@ -245,6 +341,27 @@ public class FusionDictionary implements Iterable<Word> {
add(getCodePoints(word), frequency, shortcutTargets, null, true /* isShortcutOnly */); add(getCodePoints(word), frequency, shortcutTargets, null, true /* isShortcutOnly */);
} }
/**
 * Helper method to add a new bigram to the dictionary.
 *
 * The first word must already be in the dictionary. If the second word is not, it is
 * inserted with frequency 0 before the bigram is attached to the first word.
 *
 * @param word1 the previous word of the context
 * @param word2 the next word of the context
 * @param frequency the bigram frequency
 * @throws RuntimeException if {@code word1} is not in the dictionary
 */
public void setBigram(final String word1, final String word2, final int frequency) {
    CharGroup charGroup = findWordInTree(mRoot, word1);
    if (charGroup == null) {
        throw new RuntimeException("First word of bigram not found");
    }
    if (findWordInTree(mRoot, word2) == null) {
        // TODO: refactor with the identical code in addNeutralWords
        add(getCodePoints(word2), 0, null, null, false /* isShortcutOnly */);
        // Inserting word2 may have replaced the node that held word1 (for instance when
        // both words share a stem that was not a split point until now), which would
        // leave charGroup pointing at an object that is no longer part of the tree.
        // Look word1 up again so the bigram lands on the live node.
        // NOTE(review): assumes add() never removes an existing word, so this lookup
        // cannot return null here - confirm.
        charGroup = findWordInTree(mRoot, word1);
    }
    charGroup.addBigram(word2, frequency);
}
/** /**
* Add a word to this dictionary. * Add a word to this dictionary.
* *
@ -293,17 +410,9 @@ public class FusionDictionary implements Iterable<Word> {
if (differentCharIndex == currentGroup.mChars.length) { if (differentCharIndex == currentGroup.mChars.length) {
if (charIndex + differentCharIndex >= word.length) { if (charIndex + differentCharIndex >= word.length) {
// The new word is a prefix of an existing word, but the node on which it // The new word is a prefix of an existing word, but the node on which it
// should end already exists as is. // should end already exists as is. Since the old CharNode was not a terminal,
if (currentGroup.mFrequency > 0) { // make it one by filling in its frequency and other attributes
throw new RuntimeException("Such a word already exists in the dictionary : " currentGroup.update(frequency, shortcutTargets, bigrams, isShortcutOnly);
+ new String(word, 0, word.length));
} else {
final CharGroup newNode = new CharGroup(currentGroup.mChars,
shortcutTargets, bigrams, frequency, currentGroup.mChildren,
isShortcutOnly);
currentNode.mData.set(nodeIndex, newNode);
checkStack(currentNode);
}
} else { } else {
// The new word matches the full old word and extends past it. // The new word matches the full old word and extends past it.
// We only have to create a new node and add it to the end of this. // We only have to create a new node and add it to the end of this.
@ -315,19 +424,9 @@ public class FusionDictionary implements Iterable<Word> {
} }
} else { } else {
if (0 == differentCharIndex) { if (0 == differentCharIndex) {
// Exact same word. Check the frequency is 0 or NOT_A_TERMINAL, and update. // Exact same word. Update the frequency if higher. This will also add the
if (0 != frequency) { // new bigrams to the existing bigram list if it already exists.
if (0 < currentGroup.mFrequency) { currentGroup.update(frequency, shortcutTargets, bigrams, isShortcutOnly);
throw new RuntimeException("This word already exists with frequency "
+ currentGroup.mFrequency + " : "
+ new String(word, 0, word.length));
}
final CharGroup newGroup = new CharGroup(word,
currentGroup.mShortcutTargets, currentGroup.mBigrams,
frequency, currentGroup.mChildren,
currentGroup.mIsShortcutOnly && isShortcutOnly);
currentNode.mData.set(nodeIndex, newGroup);
}
} else { } else {
// Partial prefix match only. We have to replace the current node with a node // Partial prefix match only. We have to replace the current node with a node
// containing the current prefix and create two new ones for the tails. // containing the current prefix and create two new ones for the tails.