Merge "Ignore bigrams that are not also listed as unigrams" into jb-dev

This commit is contained in:
Jean Chalard 2012-04-26 00:26:04 -07:00 committed by Android (Google) Code Review
commit 329c8d7bcc
5 changed files with 51 additions and 41 deletions

View file

@ -159,7 +159,7 @@ abstract public class ExpandableBinaryDictionary extends Dictionary {
// TODO: Create "cache dictionary" to cache fresh words for frequently updated dictionaries, // TODO: Create "cache dictionary" to cache fresh words for frequently updated dictionaries,
// considering performance regression. // considering performance regression.
protected void addWord(final String word, final int frequency) { protected void addWord(final String word, final int frequency) {
mFusionDictionary.add(word, frequency, null, null); mFusionDictionary.add(word, frequency, null /* shortcutTargets */);
} }
/** /**

View file

@ -1317,8 +1317,16 @@ public class BinaryDictInputOutput {
0 != (optionsFlags & GERMAN_UMLAUT_PROCESSING_FLAG), 0 != (optionsFlags & GERMAN_UMLAUT_PROCESSING_FLAG),
0 != (optionsFlags & FRENCH_LIGATURE_PROCESSING_FLAG))); 0 != (optionsFlags & FRENCH_LIGATURE_PROCESSING_FLAG)));
if (null != dict) { if (null != dict) {
for (Word w : dict) { for (final Word w : dict) {
newDict.add(w.mWord, w.mFrequency, w.mShortcutTargets, w.mBigrams); newDict.add(w.mWord, w.mFrequency, w.mShortcutTargets);
}
for (final Word w : dict) {
// By construction, a binary dictionary cannot contain bigrams pointing to
// words that are not also registered as unigrams, so there is no need to
// filter them out explicitly here.
for (final WeightedString bigram : w.mBigrams) {
newDict.setBigram(w.mWord, bigram.mWord, bigram.mFrequency);
}
} }
} }

View file

@ -286,7 +286,7 @@ public class FusionDictionary implements Iterable<Word> {
for (WeightedString word : words) { for (WeightedString word : words) {
final CharGroup t = findWordInTree(mRoot, word.mWord); final CharGroup t = findWordInTree(mRoot, word.mWord);
if (null == t) { if (null == t) {
add(getCodePoints(word.mWord), 0, null, null); add(getCodePoints(word.mWord), 0, null);
} }
} }
} }
@ -305,12 +305,8 @@ public class FusionDictionary implements Iterable<Word> {
* @param bigrams a list of bigrams, or null. * @param bigrams a list of bigrams, or null.
*/ */
public void add(final String word, final int frequency, public void add(final String word, final int frequency,
final ArrayList<WeightedString> shortcutTargets, final ArrayList<WeightedString> shortcutTargets) {
final ArrayList<WeightedString> bigrams) { add(getCodePoints(word), frequency, shortcutTargets);
if (null != bigrams) {
addNeutralWords(bigrams);
}
add(getCodePoints(word), frequency, shortcutTargets, bigrams);
} }
/** /**
@ -344,7 +340,7 @@ public class FusionDictionary implements Iterable<Word> {
final CharGroup charGroup2 = findWordInTree(mRoot, word2); final CharGroup charGroup2 = findWordInTree(mRoot, word2);
if (charGroup2 == null) { if (charGroup2 == null) {
// TODO: refactor with the identical code in addNeutralWords // TODO: refactor with the identical code in addNeutralWords
add(getCodePoints(word2), 0, null, null); add(getCodePoints(word2), 0, null);
} }
charGroup.addBigram(word2, frequency); charGroup.addBigram(word2, frequency);
} else { } else {
@ -355,17 +351,15 @@ public class FusionDictionary implements Iterable<Word> {
/** /**
* Add a word to this dictionary. * Add a word to this dictionary.
* *
* The shortcuts and bigrams, if any, have to be in the dictionary already. If they aren't, * The shortcuts, if any, have to be in the dictionary already. If they aren't,
* an exception is thrown. * an exception is thrown.
* *
* @param word the word, as an int array. * @param word the word, as an int array.
* @param frequency the frequency of the word, in the range [0..255]. * @param frequency the frequency of the word, in the range [0..255].
* @param shortcutTargets an optional list of shortcut targets for this word (null if none). * @param shortcutTargets an optional list of shortcut targets for this word (null if none).
* @param bigrams an optional list of bigrams for this word (null if none).
*/ */
private void add(final int[] word, final int frequency, private void add(final int[] word, final int frequency,
final ArrayList<WeightedString> shortcutTargets, final ArrayList<WeightedString> shortcutTargets) {
final ArrayList<WeightedString> bigrams) {
assert(frequency >= 0 && frequency <= 255); assert(frequency >= 0 && frequency <= 255);
Node currentNode = mRoot; Node currentNode = mRoot;
int charIndex = 0; int charIndex = 0;
@ -390,7 +384,7 @@ public class FusionDictionary implements Iterable<Word> {
final int insertionIndex = findInsertionIndex(currentNode, word[charIndex]); final int insertionIndex = findInsertionIndex(currentNode, word[charIndex]);
final CharGroup newGroup = new CharGroup( final CharGroup newGroup = new CharGroup(
Arrays.copyOfRange(word, charIndex, word.length), Arrays.copyOfRange(word, charIndex, word.length),
shortcutTargets, bigrams, frequency); shortcutTargets, null /* bigrams */, frequency);
currentNode.mData.add(insertionIndex, newGroup); currentNode.mData.add(insertionIndex, newGroup);
checkStack(currentNode); checkStack(currentNode);
} else { } else {
@ -400,21 +394,21 @@ public class FusionDictionary implements Iterable<Word> {
// The new word is a prefix of an existing word, but the node on which it // The new word is a prefix of an existing word, but the node on which it
// should end already exists as is. Since the old CharNode was not a terminal, // should end already exists as is. Since the old CharNode was not a terminal,
// make it one by filling in its frequency and other attributes // make it one by filling in its frequency and other attributes
currentGroup.update(frequency, shortcutTargets, bigrams); currentGroup.update(frequency, shortcutTargets, null);
} else { } else {
// The new word matches the full old word and extends past it. // The new word matches the full old word and extends past it.
// We only have to create a new node and add it to the end of this. // We only have to create a new node and add it to the end of this.
final CharGroup newNode = new CharGroup( final CharGroup newNode = new CharGroup(
Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length), Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length),
shortcutTargets, bigrams, frequency); shortcutTargets, null /* bigrams */, frequency);
currentGroup.mChildren = new Node(); currentGroup.mChildren = new Node();
currentGroup.mChildren.mData.add(newNode); currentGroup.mChildren.mData.add(newNode);
} }
} else { } else {
if (0 == differentCharIndex) { if (0 == differentCharIndex) {
// Exact same word. Update the frequency if higher. This will also add the // Exact same word. Update the frequency if higher. This will also add the
// new bigrams to the existing bigram list if it already exists. // new shortcuts to the existing shortcut list if it already exists.
currentGroup.update(frequency, shortcutTargets, bigrams); currentGroup.update(frequency, shortcutTargets, null);
} else { } else {
// Partial prefix match only. We have to replace the current node with a node // Partial prefix match only. We have to replace the current node with a node
// containing the current prefix and create two new ones for the tails. // containing the current prefix and create two new ones for the tails.
@ -429,14 +423,14 @@ public class FusionDictionary implements Iterable<Word> {
if (charIndex + differentCharIndex >= word.length) { if (charIndex + differentCharIndex >= word.length) {
newParent = new CharGroup( newParent = new CharGroup(
Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex), Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex),
shortcutTargets, bigrams, frequency, newChildren); shortcutTargets, null /* bigrams */, frequency, newChildren);
} else { } else {
newParent = new CharGroup( newParent = new CharGroup(
Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex), Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex),
null, null, -1, newChildren); null /* shortcutTargets */, null /* bigrams */, -1, newChildren);
final CharGroup newWord = new CharGroup( final CharGroup newWord = new CharGroup(Arrays.copyOfRange(word,
Arrays.copyOfRange(word, charIndex + differentCharIndex, charIndex + differentCharIndex, word.length),
word.length), shortcutTargets, bigrams, frequency); shortcutTargets, null /* bigrams */, frequency);
final int addIndex = word[charIndex + differentCharIndex] final int addIndex = word[charIndex + differentCharIndex]
> currentGroup.mChars[differentCharIndex] ? 1 : 0; > currentGroup.mChars[differentCharIndex] ? 1 : 0;
newChildren.mData.add(addIndex, newWord); newChildren.mData.add(addIndex, newWord);
@ -494,7 +488,8 @@ public class FusionDictionary implements Iterable<Word> {
*/ */
private static int findInsertionIndex(final Node node, int character) { private static int findInsertionIndex(final Node node, int character) {
final ArrayList<CharGroup> data = node.mData; final ArrayList<CharGroup> data = node.mData;
final CharGroup reference = new CharGroup(new int[] { character }, null, null, 0); final CharGroup reference = new CharGroup(new int[] { character },
null /* shortcutTargets */, null /* bigrams */, 0);
int result = Collections.binarySearch(data, reference, CHARGROUP_COMPARATOR); int result = Collections.binarySearch(data, reference, CHARGROUP_COMPARATOR);
return result >= 0 ? result : -result - 1; return result >= 0 ? result : -result - 1;
} }

View file

@ -72,19 +72,15 @@ public class XmlDictInputOutput {
int mFreq; // the currently read freq int mFreq; // the currently read freq
String mWord; // the current word String mWord; // the current word
final HashMap<String, ArrayList<WeightedString>> mShortcutsMap; final HashMap<String, ArrayList<WeightedString>> mShortcutsMap;
final HashMap<String, ArrayList<WeightedString>> mBigramsMap;
/** /**
* Create the handler. * Create the handler.
* *
* @param shortcuts the shortcuts as a map. This may be empty, but may not be null. * @param shortcuts the shortcuts as a map. This may be empty, but may not be null.
* @param bigrams the bigrams as a map. This may be empty, but may not be null.
*/ */
public UnigramHandler(final HashMap<String, ArrayList<WeightedString>> shortcuts, public UnigramHandler(final HashMap<String, ArrayList<WeightedString>> shortcuts) {
final HashMap<String, ArrayList<WeightedString>> bigrams) {
mDictionary = null; mDictionary = null;
mShortcutsMap = shortcuts; mShortcutsMap = shortcuts;
mBigramsMap = bigrams;
mWord = ""; mWord = "";
mState = START; mState = START;
mFreq = 0; mFreq = 0;
@ -94,7 +90,6 @@ public class XmlDictInputOutput {
final FusionDictionary dict = mDictionary; final FusionDictionary dict = mDictionary;
mDictionary = null; mDictionary = null;
mShortcutsMap.clear(); mShortcutsMap.clear();
mBigramsMap.clear();
mWord = ""; mWord = "";
mState = START; mState = START;
mFreq = 0; mFreq = 0;
@ -143,7 +138,7 @@ public class XmlDictInputOutput {
@Override @Override
public void endElement(String uri, String localName, String qName) { public void endElement(String uri, String localName, String qName) {
if (WORD == mState) { if (WORD == mState) {
mDictionary.add(mWord, mFreq, mShortcutsMap.get(mWord), mBigramsMap.get(mWord)); mDictionary.add(mWord, mFreq, mShortcutsMap.get(mWord));
mState = START; mState = START;
} }
} }
@ -191,6 +186,7 @@ public class XmlDictInputOutput {
} }
} }
// This may return an empty map, but will never return null.
public HashMap<String, ArrayList<WeightedString>> getAssocMap() { public HashMap<String, ArrayList<WeightedString>> getAssocMap() {
return mAssocMap; return mAssocMap;
} }
@ -211,6 +207,7 @@ public class XmlDictInputOutput {
BIGRAM_FREQ_ATTRIBUTE); BIGRAM_FREQ_ATTRIBUTE);
} }
// As per getAssocMap(), this never returns null.
public HashMap<String, ArrayList<WeightedString>> getBigramMap() { public HashMap<String, ArrayList<WeightedString>> getBigramMap() {
return getAssocMap(); return getAssocMap();
} }
@ -231,6 +228,7 @@ public class XmlDictInputOutput {
TARGET_PRIORITY_ATTRIBUTE); TARGET_PRIORITY_ATTRIBUTE);
} }
// As per getAssocMap(), this never returns null.
public HashMap<String, ArrayList<WeightedString>> getShortcutMap() { public HashMap<String, ArrayList<WeightedString>> getShortcutMap() {
return getAssocMap(); return getAssocMap();
} }
@ -260,10 +258,19 @@ public class XmlDictInputOutput {
if (null != shortcuts) parser.parse(shortcuts, shortcutHandler); if (null != shortcuts) parser.parse(shortcuts, shortcutHandler);
final UnigramHandler unigramHandler = final UnigramHandler unigramHandler =
new UnigramHandler(shortcutHandler.getShortcutMap(), new UnigramHandler(shortcutHandler.getShortcutMap());
bigramHandler.getBigramMap());
parser.parse(unigrams, unigramHandler); parser.parse(unigrams, unigramHandler);
return unigramHandler.getFinalDictionary(); final FusionDictionary dict = unigramHandler.getFinalDictionary();
final HashMap<String, ArrayList<WeightedString>> bigramMap = bigramHandler.getBigramMap();
for (final String firstWord : bigramMap.keySet()) {
if (!dict.hasWord(firstWord)) continue;
final ArrayList<WeightedString> bigramList = bigramMap.get(firstWord);
for (final WeightedString bigram : bigramList) {
if (!dict.hasWord(bigram.mWord)) continue;
dict.setBigram(firstWord, bigram.mWord, bigram.mFrequency);
}
}
return dict;
} }
/** /**

View file

@ -43,11 +43,11 @@ public class BinaryDictInputOutputTest extends TestCase {
final FusionDictionary dict = new FusionDictionary(new Node(), final FusionDictionary dict = new FusionDictionary(new Node(),
new DictionaryOptions(new HashMap<String, String>(), new DictionaryOptions(new HashMap<String, String>(),
false /* germanUmlautProcessing */, false /* frenchLigatureProcessing */)); false /* germanUmlautProcessing */, false /* frenchLigatureProcessing */));
dict.add("foo", 1, null, null); dict.add("foo", 1, null);
dict.add("fta", 1, null, null); dict.add("fta", 1, null);
dict.add("ftb", 1, null, null); dict.add("ftb", 1, null);
dict.add("bar", 1, null, null); dict.add("bar", 1, null);
dict.add("fool", 1, null, null); dict.add("fool", 1, null);
final ArrayList<Node> result = BinaryDictInputOutput.flattenTree(dict.mRoot); final ArrayList<Node> result = BinaryDictInputOutput.flattenTree(dict.mRoot);
assertEquals(4, result.size()); assertEquals(4, result.size());
while (!result.isEmpty()) { while (!result.isEmpty()) {