am 329c8d7b: Merge "Ignore bigrams that are not also listed as unigrams" into jb-dev
* commit '329c8d7bcce4f785fa6e31df6cbda0c11014d49b': Ignore bigrams that are not also listed as unigrams
main
commit
a3c00c1530
|
@ -159,7 +159,7 @@ abstract public class ExpandableBinaryDictionary extends Dictionary {
|
||||||
// TODO: Create "cache dictionary" to cache fresh words for frequently updated dictionaries,
|
// TODO: Create "cache dictionary" to cache fresh words for frequently updated dictionaries,
|
||||||
// considering performance regression.
|
// considering performance regression.
|
||||||
protected void addWord(final String word, final int frequency) {
|
protected void addWord(final String word, final int frequency) {
|
||||||
mFusionDictionary.add(word, frequency, null, null);
|
mFusionDictionary.add(word, frequency, null /* shortcutTargets */);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -1317,8 +1317,16 @@ public class BinaryDictInputOutput {
|
||||||
0 != (optionsFlags & GERMAN_UMLAUT_PROCESSING_FLAG),
|
0 != (optionsFlags & GERMAN_UMLAUT_PROCESSING_FLAG),
|
||||||
0 != (optionsFlags & FRENCH_LIGATURE_PROCESSING_FLAG)));
|
0 != (optionsFlags & FRENCH_LIGATURE_PROCESSING_FLAG)));
|
||||||
if (null != dict) {
|
if (null != dict) {
|
||||||
for (Word w : dict) {
|
for (final Word w : dict) {
|
||||||
newDict.add(w.mWord, w.mFrequency, w.mShortcutTargets, w.mBigrams);
|
newDict.add(w.mWord, w.mFrequency, w.mShortcutTargets);
|
||||||
|
}
|
||||||
|
for (final Word w : dict) {
|
||||||
|
// By construction a binary dictionary may not have bigrams pointing to
|
||||||
|
// words that are not also registered as unigrams so we don't have to avoid
|
||||||
|
// them explicitly here.
|
||||||
|
for (final WeightedString bigram : w.mBigrams) {
|
||||||
|
newDict.setBigram(w.mWord, bigram.mWord, bigram.mFrequency);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -286,7 +286,7 @@ public class FusionDictionary implements Iterable<Word> {
|
||||||
for (WeightedString word : words) {
|
for (WeightedString word : words) {
|
||||||
final CharGroup t = findWordInTree(mRoot, word.mWord);
|
final CharGroup t = findWordInTree(mRoot, word.mWord);
|
||||||
if (null == t) {
|
if (null == t) {
|
||||||
add(getCodePoints(word.mWord), 0, null, null);
|
add(getCodePoints(word.mWord), 0, null);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -305,12 +305,8 @@ public class FusionDictionary implements Iterable<Word> {
|
||||||
* @param bigrams a list of bigrams, or null.
|
* @param bigrams a list of bigrams, or null.
|
||||||
*/
|
*/
|
||||||
public void add(final String word, final int frequency,
|
public void add(final String word, final int frequency,
|
||||||
final ArrayList<WeightedString> shortcutTargets,
|
final ArrayList<WeightedString> shortcutTargets) {
|
||||||
final ArrayList<WeightedString> bigrams) {
|
add(getCodePoints(word), frequency, shortcutTargets);
|
||||||
if (null != bigrams) {
|
|
||||||
addNeutralWords(bigrams);
|
|
||||||
}
|
|
||||||
add(getCodePoints(word), frequency, shortcutTargets, bigrams);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -344,7 +340,7 @@ public class FusionDictionary implements Iterable<Word> {
|
||||||
final CharGroup charGroup2 = findWordInTree(mRoot, word2);
|
final CharGroup charGroup2 = findWordInTree(mRoot, word2);
|
||||||
if (charGroup2 == null) {
|
if (charGroup2 == null) {
|
||||||
// TODO: refactor with the identical code in addNeutralWords
|
// TODO: refactor with the identical code in addNeutralWords
|
||||||
add(getCodePoints(word2), 0, null, null);
|
add(getCodePoints(word2), 0, null);
|
||||||
}
|
}
|
||||||
charGroup.addBigram(word2, frequency);
|
charGroup.addBigram(word2, frequency);
|
||||||
} else {
|
} else {
|
||||||
|
@ -355,17 +351,15 @@ public class FusionDictionary implements Iterable<Word> {
|
||||||
/**
|
/**
|
||||||
* Add a word to this dictionary.
|
* Add a word to this dictionary.
|
||||||
*
|
*
|
||||||
* The shortcuts and bigrams, if any, have to be in the dictionary already. If they aren't,
|
* The shortcuts, if any, have to be in the dictionary already. If they aren't,
|
||||||
* an exception is thrown.
|
* an exception is thrown.
|
||||||
*
|
*
|
||||||
* @param word the word, as an int array.
|
* @param word the word, as an int array.
|
||||||
* @param frequency the frequency of the word, in the range [0..255].
|
* @param frequency the frequency of the word, in the range [0..255].
|
||||||
* @param shortcutTargets an optional list of shortcut targets for this word (null if none).
|
* @param shortcutTargets an optional list of shortcut targets for this word (null if none).
|
||||||
* @param bigrams an optional list of bigrams for this word (null if none).
|
|
||||||
*/
|
*/
|
||||||
private void add(final int[] word, final int frequency,
|
private void add(final int[] word, final int frequency,
|
||||||
final ArrayList<WeightedString> shortcutTargets,
|
final ArrayList<WeightedString> shortcutTargets) {
|
||||||
final ArrayList<WeightedString> bigrams) {
|
|
||||||
assert(frequency >= 0 && frequency <= 255);
|
assert(frequency >= 0 && frequency <= 255);
|
||||||
Node currentNode = mRoot;
|
Node currentNode = mRoot;
|
||||||
int charIndex = 0;
|
int charIndex = 0;
|
||||||
|
@ -390,7 +384,7 @@ public class FusionDictionary implements Iterable<Word> {
|
||||||
final int insertionIndex = findInsertionIndex(currentNode, word[charIndex]);
|
final int insertionIndex = findInsertionIndex(currentNode, word[charIndex]);
|
||||||
final CharGroup newGroup = new CharGroup(
|
final CharGroup newGroup = new CharGroup(
|
||||||
Arrays.copyOfRange(word, charIndex, word.length),
|
Arrays.copyOfRange(word, charIndex, word.length),
|
||||||
shortcutTargets, bigrams, frequency);
|
shortcutTargets, null /* bigrams */, frequency);
|
||||||
currentNode.mData.add(insertionIndex, newGroup);
|
currentNode.mData.add(insertionIndex, newGroup);
|
||||||
checkStack(currentNode);
|
checkStack(currentNode);
|
||||||
} else {
|
} else {
|
||||||
|
@ -400,21 +394,21 @@ public class FusionDictionary implements Iterable<Word> {
|
||||||
// The new word is a prefix of an existing word, but the node on which it
|
// The new word is a prefix of an existing word, but the node on which it
|
||||||
// should end already exists as is. Since the old CharNode was not a terminal,
|
// should end already exists as is. Since the old CharNode was not a terminal,
|
||||||
// make it one by filling in its frequency and other attributes
|
// make it one by filling in its frequency and other attributes
|
||||||
currentGroup.update(frequency, shortcutTargets, bigrams);
|
currentGroup.update(frequency, shortcutTargets, null);
|
||||||
} else {
|
} else {
|
||||||
// The new word matches the full old word and extends past it.
|
// The new word matches the full old word and extends past it.
|
||||||
// We only have to create a new node and add it to the end of this.
|
// We only have to create a new node and add it to the end of this.
|
||||||
final CharGroup newNode = new CharGroup(
|
final CharGroup newNode = new CharGroup(
|
||||||
Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length),
|
Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length),
|
||||||
shortcutTargets, bigrams, frequency);
|
shortcutTargets, null /* bigrams */, frequency);
|
||||||
currentGroup.mChildren = new Node();
|
currentGroup.mChildren = new Node();
|
||||||
currentGroup.mChildren.mData.add(newNode);
|
currentGroup.mChildren.mData.add(newNode);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (0 == differentCharIndex) {
|
if (0 == differentCharIndex) {
|
||||||
// Exact same word. Update the frequency if higher. This will also add the
|
// Exact same word. Update the frequency if higher. This will also add the
|
||||||
// new bigrams to the existing bigram list if it already exists.
|
// new shortcuts to the existing shortcut list if it already exists.
|
||||||
currentGroup.update(frequency, shortcutTargets, bigrams);
|
currentGroup.update(frequency, shortcutTargets, null);
|
||||||
} else {
|
} else {
|
||||||
// Partial prefix match only. We have to replace the current node with a node
|
// Partial prefix match only. We have to replace the current node with a node
|
||||||
// containing the current prefix and create two new ones for the tails.
|
// containing the current prefix and create two new ones for the tails.
|
||||||
|
@ -429,14 +423,14 @@ public class FusionDictionary implements Iterable<Word> {
|
||||||
if (charIndex + differentCharIndex >= word.length) {
|
if (charIndex + differentCharIndex >= word.length) {
|
||||||
newParent = new CharGroup(
|
newParent = new CharGroup(
|
||||||
Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex),
|
Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex),
|
||||||
shortcutTargets, bigrams, frequency, newChildren);
|
shortcutTargets, null /* bigrams */, frequency, newChildren);
|
||||||
} else {
|
} else {
|
||||||
newParent = new CharGroup(
|
newParent = new CharGroup(
|
||||||
Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex),
|
Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex),
|
||||||
null, null, -1, newChildren);
|
null /* shortcutTargets */, null /* bigrams */, -1, newChildren);
|
||||||
final CharGroup newWord = new CharGroup(
|
final CharGroup newWord = new CharGroup(Arrays.copyOfRange(word,
|
||||||
Arrays.copyOfRange(word, charIndex + differentCharIndex,
|
charIndex + differentCharIndex, word.length),
|
||||||
word.length), shortcutTargets, bigrams, frequency);
|
shortcutTargets, null /* bigrams */, frequency);
|
||||||
final int addIndex = word[charIndex + differentCharIndex]
|
final int addIndex = word[charIndex + differentCharIndex]
|
||||||
> currentGroup.mChars[differentCharIndex] ? 1 : 0;
|
> currentGroup.mChars[differentCharIndex] ? 1 : 0;
|
||||||
newChildren.mData.add(addIndex, newWord);
|
newChildren.mData.add(addIndex, newWord);
|
||||||
|
@ -494,7 +488,8 @@ public class FusionDictionary implements Iterable<Word> {
|
||||||
*/
|
*/
|
||||||
private static int findInsertionIndex(final Node node, int character) {
|
private static int findInsertionIndex(final Node node, int character) {
|
||||||
final ArrayList<CharGroup> data = node.mData;
|
final ArrayList<CharGroup> data = node.mData;
|
||||||
final CharGroup reference = new CharGroup(new int[] { character }, null, null, 0);
|
final CharGroup reference = new CharGroup(new int[] { character },
|
||||||
|
null /* shortcutTargets */, null /* bigrams */, 0);
|
||||||
int result = Collections.binarySearch(data, reference, CHARGROUP_COMPARATOR);
|
int result = Collections.binarySearch(data, reference, CHARGROUP_COMPARATOR);
|
||||||
return result >= 0 ? result : -result - 1;
|
return result >= 0 ? result : -result - 1;
|
||||||
}
|
}
|
||||||
|
|
|
@ -72,19 +72,15 @@ public class XmlDictInputOutput {
|
||||||
int mFreq; // the currently read freq
|
int mFreq; // the currently read freq
|
||||||
String mWord; // the current word
|
String mWord; // the current word
|
||||||
final HashMap<String, ArrayList<WeightedString>> mShortcutsMap;
|
final HashMap<String, ArrayList<WeightedString>> mShortcutsMap;
|
||||||
final HashMap<String, ArrayList<WeightedString>> mBigramsMap;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create the handler.
|
* Create the handler.
|
||||||
*
|
*
|
||||||
* @param shortcuts the shortcuts as a map. This may be empty, but may not be null.
|
* @param shortcuts the shortcuts as a map. This may be empty, but may not be null.
|
||||||
* @param bigrams the bigrams as a map. This may be empty, but may not be null.
|
|
||||||
*/
|
*/
|
||||||
public UnigramHandler(final HashMap<String, ArrayList<WeightedString>> shortcuts,
|
public UnigramHandler(final HashMap<String, ArrayList<WeightedString>> shortcuts) {
|
||||||
final HashMap<String, ArrayList<WeightedString>> bigrams) {
|
|
||||||
mDictionary = null;
|
mDictionary = null;
|
||||||
mShortcutsMap = shortcuts;
|
mShortcutsMap = shortcuts;
|
||||||
mBigramsMap = bigrams;
|
|
||||||
mWord = "";
|
mWord = "";
|
||||||
mState = START;
|
mState = START;
|
||||||
mFreq = 0;
|
mFreq = 0;
|
||||||
|
@ -94,7 +90,6 @@ public class XmlDictInputOutput {
|
||||||
final FusionDictionary dict = mDictionary;
|
final FusionDictionary dict = mDictionary;
|
||||||
mDictionary = null;
|
mDictionary = null;
|
||||||
mShortcutsMap.clear();
|
mShortcutsMap.clear();
|
||||||
mBigramsMap.clear();
|
|
||||||
mWord = "";
|
mWord = "";
|
||||||
mState = START;
|
mState = START;
|
||||||
mFreq = 0;
|
mFreq = 0;
|
||||||
|
@ -143,7 +138,7 @@ public class XmlDictInputOutput {
|
||||||
@Override
|
@Override
|
||||||
public void endElement(String uri, String localName, String qName) {
|
public void endElement(String uri, String localName, String qName) {
|
||||||
if (WORD == mState) {
|
if (WORD == mState) {
|
||||||
mDictionary.add(mWord, mFreq, mShortcutsMap.get(mWord), mBigramsMap.get(mWord));
|
mDictionary.add(mWord, mFreq, mShortcutsMap.get(mWord));
|
||||||
mState = START;
|
mState = START;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -191,6 +186,7 @@ public class XmlDictInputOutput {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// This may return an empty map, but will never return null.
|
||||||
public HashMap<String, ArrayList<WeightedString>> getAssocMap() {
|
public HashMap<String, ArrayList<WeightedString>> getAssocMap() {
|
||||||
return mAssocMap;
|
return mAssocMap;
|
||||||
}
|
}
|
||||||
|
@ -211,6 +207,7 @@ public class XmlDictInputOutput {
|
||||||
BIGRAM_FREQ_ATTRIBUTE);
|
BIGRAM_FREQ_ATTRIBUTE);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// As per getAssocMap(), this never returns null.
|
||||||
public HashMap<String, ArrayList<WeightedString>> getBigramMap() {
|
public HashMap<String, ArrayList<WeightedString>> getBigramMap() {
|
||||||
return getAssocMap();
|
return getAssocMap();
|
||||||
}
|
}
|
||||||
|
@ -231,6 +228,7 @@ public class XmlDictInputOutput {
|
||||||
TARGET_PRIORITY_ATTRIBUTE);
|
TARGET_PRIORITY_ATTRIBUTE);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// As per getAssocMap(), this never returns null.
|
||||||
public HashMap<String, ArrayList<WeightedString>> getShortcutMap() {
|
public HashMap<String, ArrayList<WeightedString>> getShortcutMap() {
|
||||||
return getAssocMap();
|
return getAssocMap();
|
||||||
}
|
}
|
||||||
|
@ -260,10 +258,19 @@ public class XmlDictInputOutput {
|
||||||
if (null != shortcuts) parser.parse(shortcuts, shortcutHandler);
|
if (null != shortcuts) parser.parse(shortcuts, shortcutHandler);
|
||||||
|
|
||||||
final UnigramHandler unigramHandler =
|
final UnigramHandler unigramHandler =
|
||||||
new UnigramHandler(shortcutHandler.getShortcutMap(),
|
new UnigramHandler(shortcutHandler.getShortcutMap());
|
||||||
bigramHandler.getBigramMap());
|
|
||||||
parser.parse(unigrams, unigramHandler);
|
parser.parse(unigrams, unigramHandler);
|
||||||
return unigramHandler.getFinalDictionary();
|
final FusionDictionary dict = unigramHandler.getFinalDictionary();
|
||||||
|
final HashMap<String, ArrayList<WeightedString>> bigramMap = bigramHandler.getBigramMap();
|
||||||
|
for (final String firstWord : bigramMap.keySet()) {
|
||||||
|
if (!dict.hasWord(firstWord)) continue;
|
||||||
|
final ArrayList<WeightedString> bigramList = bigramMap.get(firstWord);
|
||||||
|
for (final WeightedString bigram : bigramList) {
|
||||||
|
if (!dict.hasWord(bigram.mWord)) continue;
|
||||||
|
dict.setBigram(firstWord, bigram.mWord, bigram.mFrequency);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return dict;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -43,11 +43,11 @@ public class BinaryDictInputOutputTest extends TestCase {
|
||||||
final FusionDictionary dict = new FusionDictionary(new Node(),
|
final FusionDictionary dict = new FusionDictionary(new Node(),
|
||||||
new DictionaryOptions(new HashMap<String, String>(),
|
new DictionaryOptions(new HashMap<String, String>(),
|
||||||
false /* germanUmlautProcessing */, false /* frenchLigatureProcessing */));
|
false /* germanUmlautProcessing */, false /* frenchLigatureProcessing */));
|
||||||
dict.add("foo", 1, null, null);
|
dict.add("foo", 1, null);
|
||||||
dict.add("fta", 1, null, null);
|
dict.add("fta", 1, null);
|
||||||
dict.add("ftb", 1, null, null);
|
dict.add("ftb", 1, null);
|
||||||
dict.add("bar", 1, null, null);
|
dict.add("bar", 1, null);
|
||||||
dict.add("fool", 1, null, null);
|
dict.add("fool", 1, null);
|
||||||
final ArrayList<Node> result = BinaryDictInputOutput.flattenTree(dict.mRoot);
|
final ArrayList<Node> result = BinaryDictInputOutput.flattenTree(dict.mRoot);
|
||||||
assertEquals(4, result.size());
|
assertEquals(4, result.size());
|
||||||
while (!result.isEmpty()) {
|
while (!result.isEmpty()) {
|
||||||
|
|
Loading…
Reference in New Issue