Add shortcut support to the in-memory format of makedict (B3)

Change-Id: Icb8427c82694e24c8d08a5376157c7d1444af979
main
Jean Chalard 2011-12-26 19:00:00 +09:00
parent 3b161b2526
commit eec2e51e2c
4 changed files with 56 additions and 27 deletions

View File

@ -44,8 +44,9 @@ public class BinaryDictInputOutput {
* a | 11 = 3 bytes : FLAG_GROUP_ADDRESS_TYPE_THREEBYTES * a | 11 = 3 bytes : FLAG_GROUP_ADDRESS_TYPE_THREEBYTES
* g | has several chars ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_MULTIPLE_CHARS * g | has several chars ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_MULTIPLE_CHARS
* s | has a terminal ? 1 bit, 1 = yes, 0 = no : FLAG_IS_TERMINAL * s | has a terminal ? 1 bit, 1 = yes, 0 = no : FLAG_IS_TERMINAL
* | reserved 1 bit, 1 = yes, 0 = no * | has shortcut targets ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_SHORTCUT_TARGETS
* | has bigrams ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_BIGRAMS * | has bigrams ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_BIGRAMS
* | is shortcut only ? 1 bit, 1 = yes, 0 = no : FLAG_IS_SHORTCUT_ONLY
* *
* c | IF FLAG_HAS_MULTIPLE_CHARS * c | IF FLAG_HAS_MULTIPLE_CHARS
* h | char, char, char, char n * (1 or 3 bytes) : use CharGroupInfo for i/o helpers * h | char, char, char, char n * (1 or 3 bytes) : use CharGroupInfo for i/o helpers
@ -71,6 +72,8 @@ public class BinaryDictInputOutput {
* d * d
* dress * dress
* *
* | IF FLAG_IS_TERMINAL && FLAG_HAS_SHORTCUT_TARGETS
* | shortcut targets address list
* | IF FLAG_IS_TERMINAL && FLAG_HAS_BIGRAMS * | IF FLAG_IS_TERMINAL && FLAG_HAS_BIGRAMS
* | bigrams address list * | bigrams address list
* *
@ -126,7 +129,9 @@ public class BinaryDictInputOutput {
private static final int FLAG_HAS_MULTIPLE_CHARS = 0x20; private static final int FLAG_HAS_MULTIPLE_CHARS = 0x20;
private static final int FLAG_IS_TERMINAL = 0x10; private static final int FLAG_IS_TERMINAL = 0x10;
private static final int FLAG_HAS_SHORTCUT_TARGETS = 0x08;
private static final int FLAG_HAS_BIGRAMS = 0x04; private static final int FLAG_HAS_BIGRAMS = 0x04;
private static final int FLAG_IS_SHORTCUT_ONLY = 0x02;
private static final int FLAG_ATTRIBUTE_HAS_NEXT = 0x80; private static final int FLAG_ATTRIBUTE_HAS_NEXT = 0x80;
private static final int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40; private static final int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40;
@ -942,11 +947,13 @@ public class BinaryDictInputOutput {
source.seek(currentPosition); source.seek(currentPosition);
} }
nodeContents.add( nodeContents.add(
new CharGroup(info.mCharacters, bigrams, info.mFrequency, // TODO: read and pass the shortcut targets
new CharGroup(info.mCharacters, null, bigrams, info.mFrequency,
children)); children));
} else { } else {
// TODO: read and pass the shortcut targets
nodeContents.add( nodeContents.add(
new CharGroup(info.mCharacters, bigrams, info.mFrequency)); new CharGroup(info.mCharacters, null, bigrams, info.mFrequency));
} }
groupOffset = info.mEndAddress; groupOffset = info.mEndAddress;
} }
@ -996,7 +1003,8 @@ public class BinaryDictInputOutput {
new FusionDictionary.DictionaryOptions()); new FusionDictionary.DictionaryOptions());
if (null != dict) { if (null != dict) {
for (Word w : dict) { for (Word w : dict) {
newDict.add(w.mWord, w.mFrequency, w.mBigrams); // TODO: pass the shortcut targets
newDict.add(w.mWord, w.mFrequency, null, w.mBigrams);
} }
} }

View File

@ -68,7 +68,7 @@ public class FusionDictionary implements Iterable<Word> {
} }
/** /**
* A group of characters, with a frequency, shortcuts, bigrams, and children. * A group of characters, with a frequency, shortcut targets, bigrams, and children.
* *
* This is the central class of the in-memory representation. A CharGroup is what can * This is the central class of the in-memory representation. A CharGroup is what can
* be seen as a traditional "trie node", except it can hold several characters at the * be seen as a traditional "trie node", except it can hold several characters at the
@ -82,6 +82,7 @@ public class FusionDictionary implements Iterable<Word> {
public static class CharGroup { public static class CharGroup {
public static final int NOT_A_TERMINAL = -1; public static final int NOT_A_TERMINAL = -1;
final int mChars[]; final int mChars[];
final ArrayList<WeightedString> mShortcutTargets;
final ArrayList<WeightedString> mBigrams; final ArrayList<WeightedString> mBigrams;
final int mFrequency; // NOT_A_TERMINAL == mFrequency indicates this is not a terminal. final int mFrequency; // NOT_A_TERMINAL == mFrequency indicates this is not a terminal.
Node mChildren; Node mChildren;
@ -89,18 +90,20 @@ public class FusionDictionary implements Iterable<Word> {
int mCachedSize; int mCachedSize;
int mCachedAddress; int mCachedAddress;
public CharGroup(final int[] chars, public CharGroup(final int[] chars, final ArrayList<WeightedString> shortcutTargets,
final ArrayList<WeightedString> bigrams, final int frequency) { final ArrayList<WeightedString> bigrams, final int frequency) {
mChars = chars; mChars = chars;
mFrequency = frequency; mFrequency = frequency;
mShortcutTargets = shortcutTargets;
mBigrams = bigrams; mBigrams = bigrams;
mChildren = null; mChildren = null;
} }
public CharGroup(final int[] chars, public CharGroup(final int[] chars, final ArrayList<WeightedString> shortcutTargets,
final ArrayList<WeightedString> bigrams, final int frequency, final Node children) { final ArrayList<WeightedString> bigrams, final int frequency, final Node children) {
mChars = chars; mChars = chars;
mFrequency = frequency; mFrequency = frequency;
mShortcutTargets = shortcutTargets;
mBigrams = bigrams; mBigrams = bigrams;
mChildren = children; mChildren = children;
} }
@ -165,18 +168,29 @@ public class FusionDictionary implements Iterable<Word> {
* *
* @param word the word to add. * @param word the word to add.
* @param frequency the frequency of the word, in the range [0..255]. * @param frequency the frequency of the word, in the range [0..255].
* @param shortcutTargets a list of shortcut targets for this word, or null.
* @param bigrams a list of bigrams, or null. * @param bigrams a list of bigrams, or null.
*/ */
public void add(String word, int frequency, ArrayList<WeightedString> bigrams) { public void add(final String word, final int frequency,
final ArrayList<WeightedString> shortcutTargets,
final ArrayList<WeightedString> bigrams) {
if (null != shortcutTargets) {
for (WeightedString target : shortcutTargets) {
final CharGroup t = findWordInTree(mRoot, target.mWord);
if (null == t) {
add(getCodePoints(target.mWord), 0, null, null);
}
}
}
if (null != bigrams) { if (null != bigrams) {
for (WeightedString bigram : bigrams) { for (WeightedString bigram : bigrams) {
final CharGroup t = findWordInTree(mRoot, bigram.mWord); final CharGroup t = findWordInTree(mRoot, bigram.mWord);
if (null == t) { if (null == t) {
add(getCodePoints(bigram.mWord), 0, null); add(getCodePoints(bigram.mWord), 0, null, null);
} }
} }
} }
add(getCodePoints(word), frequency, bigrams); add(getCodePoints(word), frequency, shortcutTargets, bigrams);
} }
/** /**
@ -200,14 +214,17 @@ public class FusionDictionary implements Iterable<Word> {
/** /**
* Add a word to this dictionary. * Add a word to this dictionary.
* *
* The bigrams, if any, have to be in the dictionary already. If they aren't, * The shortcut targets and bigrams, if any, have to be in the dictionary already. If they aren't,
* an exception is thrown. * an exception is thrown.
* *
* @param word the word, as an int array. * @param word the word, as an int array.
* @param frequency the frequency of the word, in the range [0..255]. * @param frequency the frequency of the word, in the range [0..255].
* @param shortcutTargets an optional list of shortcut targets for this word (null if none).
* @param bigrams an optional list of bigrams for this word (null if none). * @param bigrams an optional list of bigrams for this word (null if none).
*/ */
private void add(int[] word, int frequency, ArrayList<WeightedString> bigrams) { private void add(final int[] word, final int frequency,
final ArrayList<WeightedString> shortcutTargets,
final ArrayList<WeightedString> bigrams) {
assert(frequency >= 0 && frequency <= 255); assert(frequency >= 0 && frequency <= 255);
Node currentNode = mRoot; Node currentNode = mRoot;
int charIndex = 0; int charIndex = 0;
@ -231,7 +248,8 @@ public class FusionDictionary implements Iterable<Word> {
// No node at this point to accept the word. Create one. // No node at this point to accept the word. Create one.
final int insertionIndex = findInsertionIndex(currentNode, word[charIndex]); final int insertionIndex = findInsertionIndex(currentNode, word[charIndex]);
final CharGroup newGroup = new CharGroup( final CharGroup newGroup = new CharGroup(
Arrays.copyOfRange(word, charIndex, word.length), bigrams, frequency); Arrays.copyOfRange(word, charIndex, word.length),
shortcutTargets, bigrams, frequency);
currentNode.mData.add(insertionIndex, newGroup); currentNode.mData.add(insertionIndex, newGroup);
checkStack(currentNode); checkStack(currentNode);
} else { } else {
@ -245,7 +263,7 @@ public class FusionDictionary implements Iterable<Word> {
+ new String(word, 0, word.length)); + new String(word, 0, word.length));
} else { } else {
final CharGroup newNode = new CharGroup(currentGroup.mChars, final CharGroup newNode = new CharGroup(currentGroup.mChars,
bigrams, frequency, currentGroup.mChildren); shortcutTargets, bigrams, frequency, currentGroup.mChildren);
currentNode.mData.set(nodeIndex, newNode); currentNode.mData.set(nodeIndex, newNode);
checkStack(currentNode); checkStack(currentNode);
} }
@ -254,7 +272,7 @@ public class FusionDictionary implements Iterable<Word> {
// We only have to create a new node and add it to the end of this. // We only have to create a new node and add it to the end of this.
final CharGroup newNode = new CharGroup( final CharGroup newNode = new CharGroup(
Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length), Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length),
bigrams, frequency); shortcutTargets, bigrams, frequency);
currentGroup.mChildren = new Node(); currentGroup.mChildren = new Node();
currentGroup.mChildren.mData.add(newNode); currentGroup.mChildren.mData.add(newNode);
} }
@ -268,7 +286,8 @@ public class FusionDictionary implements Iterable<Word> {
+ new String(word, 0, word.length)); + new String(word, 0, word.length));
} }
final CharGroup newGroup = new CharGroup(word, final CharGroup newGroup = new CharGroup(word,
currentGroup.mBigrams, frequency, currentGroup.mChildren); currentGroup.mShortcutTargets, currentGroup.mBigrams,
frequency, currentGroup.mChildren);
currentNode.mData.set(nodeIndex, newGroup); currentNode.mData.set(nodeIndex, newGroup);
} }
} else { } else {
@ -277,7 +296,7 @@ public class FusionDictionary implements Iterable<Word> {
Node newChildren = new Node(); Node newChildren = new Node();
final CharGroup newOldWord = new CharGroup( final CharGroup newOldWord = new CharGroup(
Arrays.copyOfRange(currentGroup.mChars, differentCharIndex, Arrays.copyOfRange(currentGroup.mChars, differentCharIndex,
currentGroup.mChars.length), currentGroup.mChars.length), currentGroup.mShortcutTargets,
currentGroup.mBigrams, currentGroup.mFrequency, currentGroup.mChildren); currentGroup.mBigrams, currentGroup.mFrequency, currentGroup.mChildren);
newChildren.mData.add(newOldWord); newChildren.mData.add(newOldWord);
@ -285,14 +304,14 @@ public class FusionDictionary implements Iterable<Word> {
if (charIndex + differentCharIndex >= word.length) { if (charIndex + differentCharIndex >= word.length) {
newParent = new CharGroup( newParent = new CharGroup(
Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex), Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex),
bigrams, frequency, newChildren); shortcutTargets, bigrams, frequency, newChildren);
} else { } else {
newParent = new CharGroup( newParent = new CharGroup(
Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex), Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex),
null, -1, newChildren); null, null, -1, newChildren);
final CharGroup newWord = new CharGroup( final CharGroup newWord = new CharGroup(
Arrays.copyOfRange(word, charIndex + differentCharIndex, Arrays.copyOfRange(word, charIndex + differentCharIndex,
word.length), bigrams, frequency); word.length), shortcutTargets, bigrams, frequency);
final int addIndex = word[charIndex + differentCharIndex] final int addIndex = word[charIndex + differentCharIndex]
> currentGroup.mChars[differentCharIndex] ? 1 : 0; > currentGroup.mChars[differentCharIndex] ? 1 : 0;
newChildren.mData.add(addIndex, newWord); newChildren.mData.add(addIndex, newWord);
@ -355,7 +374,7 @@ public class FusionDictionary implements Iterable<Word> {
*/ */
private static int findInsertionIndex(final Node node, int character) { private static int findInsertionIndex(final Node node, int character) {
final List data = node.mData; final List data = node.mData;
final CharGroup reference = new CharGroup(new int[] { character }, null, 0); final CharGroup reference = new CharGroup(new int[] { character }, null, null, 0);
int result = Collections.binarySearch(data, reference, CHARGROUP_COMPARATOR); int result = Collections.binarySearch(data, reference, CHARGROUP_COMPARATOR);
return result >= 0 ? result : -result - 1; return result >= 0 ? result : -result - 1;
} }
@ -573,6 +592,7 @@ public class FusionDictionary implements Iterable<Word> {
} }
if (currentGroup.mFrequency >= 0) if (currentGroup.mFrequency >= 0)
return new Word(mCurrentString.toString(), currentGroup.mFrequency, return new Word(mCurrentString.toString(), currentGroup.mFrequency,
// TODO: pass the shortcut targets here
currentGroup.mBigrams); currentGroup.mBigrams);
} else { } else {
mPositions.removeLast(); mPositions.removeLast();

View File

@ -107,7 +107,8 @@ public class XmlDictInputOutput {
@Override @Override
public void endElement(String uri, String localName, String qName) { public void endElement(String uri, String localName, String qName) {
if (WORD == mState) { if (WORD == mState) {
mDictionary.add(mWord, mFreq, mBigramsMap.get(mWord)); // TODO: pass the shortcut targets
mDictionary.add(mWord, mFreq, null, mBigramsMap.get(mWord));
mState = START; mState = START;
} }
} }

View File

@ -39,11 +39,11 @@ public class BinaryDictInputOutputTest extends TestCase {
// that it does not contain any duplicates. // that it does not contain any duplicates.
public void testFlattenNodes() { public void testFlattenNodes() {
final FusionDictionary dict = new FusionDictionary(); final FusionDictionary dict = new FusionDictionary();
dict.add("foo", 1, null); dict.add("foo", 1, null, null);
dict.add("fta", 1, null); dict.add("fta", 1, null, null);
dict.add("ftb", 1, null); dict.add("ftb", 1, null, null);
dict.add("bar", 1, null); dict.add("bar", 1, null, null);
dict.add("fool", 1, null); dict.add("fool", 1, null, null);
final ArrayList<Node> result = BinaryDictInputOutput.flattenTree(dict.mRoot); final ArrayList<Node> result = BinaryDictInputOutput.flattenTree(dict.mRoot);
assertEquals(4, result.size()); assertEquals(4, result.size());
while (!result.isEmpty()) { while (!result.isEmpty()) {