Actually add shortcut-only entries.
Change-Id: I84bec8fb560cec2ad9cc857397a3f77a96b1d12d
main
parent
d64b8c97fe
commit
c599f2e9d6
|
@ -606,7 +606,9 @@ public class BinaryDictInputOutput {
|
||||||
}
|
}
|
||||||
flags |= FLAG_HAS_BIGRAMS;
|
flags |= FLAG_HAS_BIGRAMS;
|
||||||
}
|
}
|
||||||
// TODO: fill in the FLAG_IS_SHORTCUT_ONLY
|
if (group.mIsShortcutOnly) {
|
||||||
|
flags |= FLAG_IS_SHORTCUT_ONLY;
|
||||||
|
}
|
||||||
return flags;
|
return flags;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -170,6 +170,24 @@ public class FusionDictionary implements Iterable<Word> {
|
||||||
return array;
|
return array;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Helper method to add all words in a list as 0-frequency entries.
|
||||||
|
*
|
||||||
|
* These words are added when shortcut targets or bigrams are not found in the dictionary
|
||||||
|
* yet. The same words may be added later with an actual frequency - this is handled by
|
||||||
|
* the private version of add().
|
||||||
|
*/
|
||||||
|
private void addNeutralWords(final ArrayList<WeightedString> words) {
|
||||||
|
if (null != words) {
|
||||||
|
for (WeightedString word : words) {
|
||||||
|
final CharGroup t = findWordInTree(mRoot, word.mWord);
|
||||||
|
if (null == t) {
|
||||||
|
add(getCodePoints(word.mWord), 0, null, null, false /* isShortcutOnly */);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Helper method to add a word as a string.
|
* Helper method to add a word as a string.
|
||||||
*
|
*
|
||||||
|
@ -186,22 +204,12 @@ public class FusionDictionary implements Iterable<Word> {
|
||||||
final ArrayList<WeightedString> shortcutTargets,
|
final ArrayList<WeightedString> shortcutTargets,
|
||||||
final ArrayList<WeightedString> bigrams) {
|
final ArrayList<WeightedString> bigrams) {
|
||||||
if (null != shortcutTargets) {
|
if (null != shortcutTargets) {
|
||||||
for (WeightedString target : shortcutTargets) {
|
addNeutralWords(shortcutTargets);
|
||||||
final CharGroup t = findWordInTree(mRoot, target.mWord);
|
|
||||||
if (null == t) {
|
|
||||||
add(getCodePoints(target.mWord), 0, null, null);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
if (null != bigrams) {
|
if (null != bigrams) {
|
||||||
for (WeightedString bigram : bigrams) {
|
addNeutralWords(bigrams);
|
||||||
final CharGroup t = findWordInTree(mRoot, bigram.mWord);
|
|
||||||
if (null == t) {
|
|
||||||
add(getCodePoints(bigram.mWord), 0, null, null);
|
|
||||||
}
|
}
|
||||||
}
|
add(getCodePoints(word), frequency, shortcutTargets, bigrams, false /* isShortcutOnly */);
|
||||||
}
|
|
||||||
add(getCodePoints(word), frequency, shortcutTargets, bigrams);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -222,6 +230,22 @@ public class FusionDictionary implements Iterable<Word> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Helper method to add a shortcut that should not be a dictionary word.
|
||||||
|
*
|
||||||
|
* @param word the word to add.
|
||||||
|
* @param frequency the frequency of the word, in the range [0..255].
|
||||||
|
* @param shortcutTargets a list of shortcut targets. May not be null.
|
||||||
|
*/
|
||||||
|
public void addShortcutOnly(final String word, final int frequency,
|
||||||
|
final ArrayList<WeightedString> shortcutTargets) {
|
||||||
|
if (null == shortcutTargets) {
|
||||||
|
throw new RuntimeException("Can't add a shortcut without targets");
|
||||||
|
}
|
||||||
|
addNeutralWords(shortcutTargets);
|
||||||
|
add(getCodePoints(word), frequency, shortcutTargets, null, true /* isShortcutOnly */);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Add a word to this dictionary.
|
* Add a word to this dictionary.
|
||||||
*
|
*
|
||||||
|
@ -232,10 +256,12 @@ public class FusionDictionary implements Iterable<Word> {
|
||||||
* @param frequency the frequency of the word, in the range [0..255].
|
* @param frequency the frequency of the word, in the range [0..255].
|
||||||
* @param shortcutTargets an optional list of shortcut targets for this word (null if none).
|
* @param shortcutTargets an optional list of shortcut targets for this word (null if none).
|
||||||
* @param bigrams an optional list of bigrams for this word (null if none).
|
* @param bigrams an optional list of bigrams for this word (null if none).
|
||||||
|
* @param isShortcutOnly whether this should be a shortcut only.
|
||||||
*/
|
*/
|
||||||
private void add(final int[] word, final int frequency,
|
private void add(final int[] word, final int frequency,
|
||||||
final ArrayList<WeightedString> shortcutTargets,
|
final ArrayList<WeightedString> shortcutTargets,
|
||||||
final ArrayList<WeightedString> bigrams) {
|
final ArrayList<WeightedString> bigrams,
|
||||||
|
final boolean isShortcutOnly) {
|
||||||
assert(frequency >= 0 && frequency <= 255);
|
assert(frequency >= 0 && frequency <= 255);
|
||||||
Node currentNode = mRoot;
|
Node currentNode = mRoot;
|
||||||
int charIndex = 0;
|
int charIndex = 0;
|
||||||
|
@ -260,7 +286,7 @@ public class FusionDictionary implements Iterable<Word> {
|
||||||
final int insertionIndex = findInsertionIndex(currentNode, word[charIndex]);
|
final int insertionIndex = findInsertionIndex(currentNode, word[charIndex]);
|
||||||
final CharGroup newGroup = new CharGroup(
|
final CharGroup newGroup = new CharGroup(
|
||||||
Arrays.copyOfRange(word, charIndex, word.length),
|
Arrays.copyOfRange(word, charIndex, word.length),
|
||||||
shortcutTargets, bigrams, frequency, false /* isShortcutOnly */);
|
shortcutTargets, bigrams, frequency, isShortcutOnly);
|
||||||
currentNode.mData.add(insertionIndex, newGroup);
|
currentNode.mData.add(insertionIndex, newGroup);
|
||||||
checkStack(currentNode);
|
checkStack(currentNode);
|
||||||
} else {
|
} else {
|
||||||
|
@ -275,7 +301,7 @@ public class FusionDictionary implements Iterable<Word> {
|
||||||
} else {
|
} else {
|
||||||
final CharGroup newNode = new CharGroup(currentGroup.mChars,
|
final CharGroup newNode = new CharGroup(currentGroup.mChars,
|
||||||
shortcutTargets, bigrams, frequency, currentGroup.mChildren,
|
shortcutTargets, bigrams, frequency, currentGroup.mChildren,
|
||||||
false /* isShortcutOnly */);
|
isShortcutOnly);
|
||||||
currentNode.mData.set(nodeIndex, newNode);
|
currentNode.mData.set(nodeIndex, newNode);
|
||||||
checkStack(currentNode);
|
checkStack(currentNode);
|
||||||
}
|
}
|
||||||
|
@ -284,8 +310,7 @@ public class FusionDictionary implements Iterable<Word> {
|
||||||
// We only have to create a new node and add it to the end of this.
|
// We only have to create a new node and add it to the end of this.
|
||||||
final CharGroup newNode = new CharGroup(
|
final CharGroup newNode = new CharGroup(
|
||||||
Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length),
|
Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length),
|
||||||
shortcutTargets, bigrams, frequency,
|
shortcutTargets, bigrams, frequency, isShortcutOnly);
|
||||||
false /* isShortcutOnly */);
|
|
||||||
currentGroup.mChildren = new Node();
|
currentGroup.mChildren = new Node();
|
||||||
currentGroup.mChildren.mData.add(newNode);
|
currentGroup.mChildren.mData.add(newNode);
|
||||||
}
|
}
|
||||||
|
@ -300,7 +325,8 @@ public class FusionDictionary implements Iterable<Word> {
|
||||||
}
|
}
|
||||||
final CharGroup newGroup = new CharGroup(word,
|
final CharGroup newGroup = new CharGroup(word,
|
||||||
currentGroup.mShortcutTargets, currentGroup.mBigrams,
|
currentGroup.mShortcutTargets, currentGroup.mBigrams,
|
||||||
frequency, currentGroup.mChildren, false /* isShortcutOnly */);
|
frequency, currentGroup.mChildren,
|
||||||
|
currentGroup.mIsShortcutOnly && isShortcutOnly);
|
||||||
currentNode.mData.set(nodeIndex, newGroup);
|
currentNode.mData.set(nodeIndex, newGroup);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
@ -318,16 +344,18 @@ public class FusionDictionary implements Iterable<Word> {
|
||||||
if (charIndex + differentCharIndex >= word.length) {
|
if (charIndex + differentCharIndex >= word.length) {
|
||||||
newParent = new CharGroup(
|
newParent = new CharGroup(
|
||||||
Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex),
|
Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex),
|
||||||
shortcutTargets, bigrams, frequency, newChildren,
|
shortcutTargets, bigrams, frequency, newChildren, isShortcutOnly);
|
||||||
false /* isShortcutOnly */);
|
|
||||||
} else {
|
} else {
|
||||||
|
// isShortcutOnly makes no sense for non-terminal nodes. The following node
|
||||||
|
// is non-terminal (frequency -1 in FusionDictionary representation) so we
|
||||||
|
// pass false for isShortcutOnly.
|
||||||
newParent = new CharGroup(
|
newParent = new CharGroup(
|
||||||
Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex),
|
Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex),
|
||||||
null, null, -1, newChildren, false /* isShortcutOnly */);
|
null, null, -1, newChildren, false /* isShortcutOnly */);
|
||||||
final CharGroup newWord = new CharGroup(
|
final CharGroup newWord = new CharGroup(
|
||||||
Arrays.copyOfRange(word, charIndex + differentCharIndex,
|
Arrays.copyOfRange(word, charIndex + differentCharIndex,
|
||||||
word.length), shortcutTargets, bigrams, frequency,
|
word.length), shortcutTargets, bigrams, frequency,
|
||||||
false /* isShortcutOnly */);
|
isShortcutOnly);
|
||||||
final int addIndex = word[charIndex + differentCharIndex]
|
final int addIndex = word[charIndex + differentCharIndex]
|
||||||
> currentGroup.mChars[differentCharIndex] ? 1 : 0;
|
> currentGroup.mChars[differentCharIndex] ? 1 : 0;
|
||||||
newChildren.mData.add(addIndex, newWord);
|
newChildren.mData.add(addIndex, newWord);
|
||||||
|
|
|
@ -46,6 +46,8 @@ public class XmlDictInputOutput {
|
||||||
private static final String FREQUENCY_ATTR = "f";
|
private static final String FREQUENCY_ATTR = "f";
|
||||||
private static final String WORD_ATTR = "word";
|
private static final String WORD_ATTR = "word";
|
||||||
|
|
||||||
|
private static final int SHORTCUT_ONLY_DEFAULT_FREQ = 1;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* SAX handler for a unigram XML file.
|
* SAX handler for a unigram XML file.
|
||||||
*/
|
*/
|
||||||
|
@ -232,6 +234,15 @@ public class XmlDictInputOutput {
|
||||||
new UnigramHandler(dict, shortcutHandler.getShortcutMap(),
|
new UnigramHandler(dict, shortcutHandler.getShortcutMap(),
|
||||||
bigramHandler.getBigramMap());
|
bigramHandler.getBigramMap());
|
||||||
parser.parse(unigrams, unigramHandler);
|
parser.parse(unigrams, unigramHandler);
|
||||||
|
|
||||||
|
final HashMap<String, ArrayList<WeightedString>> shortcutMap =
|
||||||
|
shortcutHandler.getShortcutMap();
|
||||||
|
for (final String shortcut : shortcutMap.keySet()) {
|
||||||
|
if (dict.hasWord(shortcut)) continue;
|
||||||
|
// TODO: list a frequency in the shortcut file and use it here, instead of
|
||||||
|
// a constant freq
|
||||||
|
dict.addShortcutOnly(shortcut, SHORTCUT_ONLY_DEFAULT_FREQ, shortcutMap.get(shortcut));
|
||||||
|
}
|
||||||
return dict;
|
return dict;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue