Add methods to read and write shortcut to binary files (B5)
Change-Id: I8e6a4242a73b2ec95ce9e8b4739e16dfeb9f1204main
parent
b0c49b7684
commit
8edd306718
|
@ -26,6 +26,7 @@ import java.io.OutputStream;
|
||||||
import java.io.RandomAccessFile;
|
import java.io.RandomAccessFile;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
import java.util.Iterator;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.TreeMap;
|
import java.util.TreeMap;
|
||||||
|
|
||||||
|
@ -276,10 +277,13 @@ public class BinaryDictInputOutput {
|
||||||
// If terminal, one byte for the frequency
|
// If terminal, one byte for the frequency
|
||||||
if (group.isTerminal()) size += GROUP_FREQUENCY_SIZE;
|
if (group.isTerminal()) size += GROUP_FREQUENCY_SIZE;
|
||||||
size += GROUP_MAX_ADDRESS_SIZE; // For children address
|
size += GROUP_MAX_ADDRESS_SIZE; // For children address
|
||||||
|
if (null != group.mShortcutTargets) {
|
||||||
|
size += (GROUP_ATTRIBUTE_FLAGS_SIZE + GROUP_ATTRIBUTE_MAX_ADDRESS_SIZE)
|
||||||
|
* group.mShortcutTargets.size();
|
||||||
|
}
|
||||||
if (null != group.mBigrams) {
|
if (null != group.mBigrams) {
|
||||||
for (WeightedString bigram : group.mBigrams) {
|
size += (GROUP_ATTRIBUTE_FLAGS_SIZE + GROUP_ATTRIBUTE_MAX_ADDRESS_SIZE)
|
||||||
size += GROUP_ATTRIBUTE_FLAGS_SIZE + GROUP_ATTRIBUTE_MAX_ADDRESS_SIZE;
|
* group.mBigrams.size();
|
||||||
}
|
|
||||||
}
|
}
|
||||||
return size;
|
return size;
|
||||||
}
|
}
|
||||||
|
@ -392,6 +396,15 @@ public class BinaryDictInputOutput {
|
||||||
final int offset = group.mChildren.mCachedAddress - offsetBasePoint;
|
final int offset = group.mChildren.mCachedAddress - offsetBasePoint;
|
||||||
groupSize += getByteSize(offset);
|
groupSize += getByteSize(offset);
|
||||||
}
|
}
|
||||||
|
if (null != group.mShortcutTargets) {
|
||||||
|
for (WeightedString target : group.mShortcutTargets) {
|
||||||
|
final int offsetBasePoint = groupSize + node.mCachedAddress + size
|
||||||
|
+ GROUP_FLAGS_SIZE;
|
||||||
|
final int addressOfTarget = findAddressOfWord(dict, target.mWord);
|
||||||
|
final int offset = addressOfTarget - offsetBasePoint;
|
||||||
|
groupSize += getByteSize(offset) + GROUP_FLAGS_SIZE;
|
||||||
|
}
|
||||||
|
}
|
||||||
if (null != group.mBigrams) {
|
if (null != group.mBigrams) {
|
||||||
for (WeightedString bigram : group.mBigrams) {
|
for (WeightedString bigram : group.mBigrams) {
|
||||||
final int offsetBasePoint = groupSize + node.mCachedAddress + size
|
final int offsetBasePoint = groupSize + node.mCachedAddress + size
|
||||||
|
@ -550,7 +563,19 @@ public class BinaryDictInputOutput {
|
||||||
throw new RuntimeException("Node with a strange address");
|
throw new RuntimeException("Node with a strange address");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (null != group.mBigrams) flags |= FLAG_HAS_BIGRAMS;
|
if (null != group.mShortcutTargets) {
|
||||||
|
if (0 == group.mShortcutTargets.size()) {
|
||||||
|
throw new RuntimeException("0-sized shortcut list must be null");
|
||||||
|
}
|
||||||
|
flags |= FLAG_HAS_SHORTCUT_TARGETS;
|
||||||
|
}
|
||||||
|
if (null != group.mBigrams) {
|
||||||
|
if (0 == group.mBigrams.size()) {
|
||||||
|
throw new RuntimeException("0-sized bigram list must be null");
|
||||||
|
}
|
||||||
|
flags |= FLAG_HAS_BIGRAMS;
|
||||||
|
}
|
||||||
|
// TODO: fill in the FLAG_IS_SHORTCUT_ONLY
|
||||||
return flags;
|
return flags;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -629,20 +654,36 @@ public class BinaryDictInputOutput {
|
||||||
index += shift;
|
index += shift;
|
||||||
groupAddress += shift;
|
groupAddress += shift;
|
||||||
|
|
||||||
|
// Write shortcuts
|
||||||
|
if (null != group.mShortcutTargets) {
|
||||||
|
final Iterator shortcutIterator = group.mShortcutTargets.iterator();
|
||||||
|
while (shortcutIterator.hasNext()) {
|
||||||
|
final WeightedString target = (WeightedString)shortcutIterator.next();
|
||||||
|
final int addressOfTarget = findAddressOfWord(dict, target.mWord);
|
||||||
|
++groupAddress;
|
||||||
|
final int offset = addressOfTarget - groupAddress;
|
||||||
|
int shortcutFlags = makeAttributeFlags(shortcutIterator.hasNext(), offset,
|
||||||
|
target.mFrequency);
|
||||||
|
buffer[index++] = (byte)shortcutFlags;
|
||||||
|
final int shortcutShift = writeVariableAddress(buffer, index, Math.abs(offset));
|
||||||
|
index += shortcutShift;
|
||||||
|
groupAddress += shortcutShift;
|
||||||
|
}
|
||||||
|
}
|
||||||
// Write bigrams
|
// Write bigrams
|
||||||
if (null != group.mBigrams) {
|
if (null != group.mBigrams) {
|
||||||
int remainingBigrams = group.mBigrams.size();
|
final Iterator bigramIterator = group.mBigrams.iterator();
|
||||||
for (WeightedString bigram : group.mBigrams) {
|
while (bigramIterator.hasNext()) {
|
||||||
boolean more = remainingBigrams > 1;
|
final WeightedString bigram = (WeightedString)bigramIterator.next();
|
||||||
final int addressOfBigram = findAddressOfWord(dict, bigram.mWord);
|
final int addressOfBigram = findAddressOfWord(dict, bigram.mWord);
|
||||||
++groupAddress;
|
++groupAddress;
|
||||||
final int offset = addressOfBigram - groupAddress;
|
final int offset = addressOfBigram - groupAddress;
|
||||||
int bigramFlags = makeAttributeFlags(more, offset, bigram.mFrequency);
|
int bigramFlags = makeAttributeFlags(bigramIterator.hasNext(), offset,
|
||||||
|
bigram.mFrequency);
|
||||||
buffer[index++] = (byte)bigramFlags;
|
buffer[index++] = (byte)bigramFlags;
|
||||||
final int bigramShift = writeVariableAddress(buffer, index, Math.abs(offset));
|
final int bigramShift = writeVariableAddress(buffer, index, Math.abs(offset));
|
||||||
index += bigramShift;
|
index += bigramShift;
|
||||||
groupAddress += bigramShift;
|
groupAddress += bigramShift;
|
||||||
--remainingBigrams;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -819,14 +860,43 @@ public class BinaryDictInputOutput {
|
||||||
childrenAddress = NO_CHILDREN_ADDRESS;
|
childrenAddress = NO_CHILDREN_ADDRESS;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
ArrayList<PendingAttribute> shortcutTargets = null;
|
||||||
|
if (0 != (flags & FLAG_HAS_SHORTCUT_TARGETS)) {
|
||||||
|
shortcutTargets = new ArrayList<PendingAttribute>();
|
||||||
|
while (true) {
|
||||||
|
final int targetFlags = source.readUnsignedByte();
|
||||||
|
++addressPointer;
|
||||||
|
final int sign = 0 == (targetFlags & FLAG_ATTRIBUTE_OFFSET_NEGATIVE) ? 1 : -1;
|
||||||
|
int targetAddress = addressPointer;
|
||||||
|
switch (targetFlags & MASK_ATTRIBUTE_ADDRESS_TYPE) {
|
||||||
|
case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE:
|
||||||
|
targetAddress += sign * source.readUnsignedByte();
|
||||||
|
addressPointer += 1;
|
||||||
|
break;
|
||||||
|
case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES:
|
||||||
|
targetAddress += sign * source.readUnsignedShort();
|
||||||
|
addressPointer += 2;
|
||||||
|
break;
|
||||||
|
case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES:
|
||||||
|
final int offset = ((source.readUnsignedByte() << 16)
|
||||||
|
+ source.readUnsignedShort());
|
||||||
|
targetAddress += sign * offset;
|
||||||
|
addressPointer += 3;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
throw new RuntimeException("Has attribute with no address");
|
||||||
|
}
|
||||||
|
shortcutTargets.add(new PendingAttribute(targetFlags & FLAG_ATTRIBUTE_FREQUENCY,
|
||||||
|
targetAddress));
|
||||||
|
if (0 == (targetFlags & FLAG_ATTRIBUTE_HAS_NEXT)) break;
|
||||||
|
}
|
||||||
|
}
|
||||||
ArrayList<PendingAttribute> bigrams = null;
|
ArrayList<PendingAttribute> bigrams = null;
|
||||||
if (0 != (flags & FLAG_HAS_BIGRAMS)) {
|
if (0 != (flags & FLAG_HAS_BIGRAMS)) {
|
||||||
bigrams = new ArrayList<PendingAttribute>();
|
bigrams = new ArrayList<PendingAttribute>();
|
||||||
boolean more = true;
|
while (true) {
|
||||||
while (more) {
|
final int bigramFlags = source.readUnsignedByte();
|
||||||
int bigramFlags = source.readUnsignedByte();
|
|
||||||
++addressPointer;
|
++addressPointer;
|
||||||
more = (0 != (bigramFlags & FLAG_ATTRIBUTE_HAS_NEXT));
|
|
||||||
final int sign = 0 == (bigramFlags & FLAG_ATTRIBUTE_OFFSET_NEGATIVE) ? 1 : -1;
|
final int sign = 0 == (bigramFlags & FLAG_ATTRIBUTE_OFFSET_NEGATIVE) ? 1 : -1;
|
||||||
int bigramAddress = addressPointer;
|
int bigramAddress = addressPointer;
|
||||||
switch (bigramFlags & MASK_ATTRIBUTE_ADDRESS_TYPE) {
|
switch (bigramFlags & MASK_ATTRIBUTE_ADDRESS_TYPE) {
|
||||||
|
@ -849,10 +919,11 @@ public class BinaryDictInputOutput {
|
||||||
}
|
}
|
||||||
bigrams.add(new PendingAttribute(bigramFlags & FLAG_ATTRIBUTE_FREQUENCY,
|
bigrams.add(new PendingAttribute(bigramFlags & FLAG_ATTRIBUTE_FREQUENCY,
|
||||||
bigramAddress));
|
bigramAddress));
|
||||||
|
if (0 == (bigramFlags & FLAG_ATTRIBUTE_HAS_NEXT)) break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return new CharGroupInfo(originalGroupAddress, addressPointer, flags, characters, frequency,
|
return new CharGroupInfo(originalGroupAddress, addressPointer, flags, characters, frequency,
|
||||||
childrenAddress, bigrams);
|
childrenAddress, shortcutTargets, bigrams);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -930,6 +1001,14 @@ public class BinaryDictInputOutput {
|
||||||
int groupOffset = nodeOrigin + 1; // 1 byte for the group count
|
int groupOffset = nodeOrigin + 1; // 1 byte for the group count
|
||||||
for (int i = count; i > 0; --i) {
|
for (int i = count; i > 0; --i) {
|
||||||
CharGroupInfo info = readCharGroup(source, groupOffset);
|
CharGroupInfo info = readCharGroup(source, groupOffset);
|
||||||
|
ArrayList<WeightedString> shortcutTargets = null;
|
||||||
|
if (null != info.mShortcutTargets) {
|
||||||
|
shortcutTargets = new ArrayList<WeightedString>();
|
||||||
|
for (PendingAttribute target : info.mShortcutTargets) {
|
||||||
|
final String word = getWordAtAddress(source, headerSize, target.mAddress);
|
||||||
|
shortcutTargets.add(new WeightedString(word, target.mFrequency));
|
||||||
|
}
|
||||||
|
}
|
||||||
ArrayList<WeightedString> bigrams = null;
|
ArrayList<WeightedString> bigrams = null;
|
||||||
if (null != info.mBigrams) {
|
if (null != info.mBigrams) {
|
||||||
bigrams = new ArrayList<WeightedString>();
|
bigrams = new ArrayList<WeightedString>();
|
||||||
|
@ -947,13 +1026,11 @@ public class BinaryDictInputOutput {
|
||||||
source.seek(currentPosition);
|
source.seek(currentPosition);
|
||||||
}
|
}
|
||||||
nodeContents.add(
|
nodeContents.add(
|
||||||
// TODO: read and pass the shortcut targets
|
new CharGroup(info.mCharacters, shortcutTargets, bigrams, info.mFrequency,
|
||||||
new CharGroup(info.mCharacters, null, bigrams, info.mFrequency,
|
|
||||||
children));
|
children));
|
||||||
} else {
|
} else {
|
||||||
// TODO: read and pass the shortcut targets
|
|
||||||
nodeContents.add(
|
nodeContents.add(
|
||||||
new CharGroup(info.mCharacters, null, bigrams, info.mFrequency));
|
new CharGroup(info.mCharacters, shortcutTargets, bigrams, info.mFrequency));
|
||||||
}
|
}
|
||||||
groupOffset = info.mEndAddress;
|
groupOffset = info.mEndAddress;
|
||||||
}
|
}
|
||||||
|
|
|
@ -29,10 +29,12 @@ public class CharGroupInfo {
|
||||||
public final int[] mCharacters;
|
public final int[] mCharacters;
|
||||||
public final int mFrequency;
|
public final int mFrequency;
|
||||||
public final int mChildrenAddress;
|
public final int mChildrenAddress;
|
||||||
|
public final ArrayList<PendingAttribute> mShortcutTargets;
|
||||||
public final ArrayList<PendingAttribute> mBigrams;
|
public final ArrayList<PendingAttribute> mBigrams;
|
||||||
|
|
||||||
public CharGroupInfo(final int originalAddress, final int endAddress, final int flags,
|
public CharGroupInfo(final int originalAddress, final int endAddress, final int flags,
|
||||||
final int[] characters, final int frequency, final int childrenAddress,
|
final int[] characters, final int frequency, final int childrenAddress,
|
||||||
|
final ArrayList<PendingAttribute> shortcutTargets,
|
||||||
final ArrayList<PendingAttribute> bigrams) {
|
final ArrayList<PendingAttribute> bigrams) {
|
||||||
mOriginalAddress = originalAddress;
|
mOriginalAddress = originalAddress;
|
||||||
mEndAddress = endAddress;
|
mEndAddress = endAddress;
|
||||||
|
@ -40,6 +42,7 @@ public class CharGroupInfo {
|
||||||
mCharacters = characters;
|
mCharacters = characters;
|
||||||
mFrequency = frequency;
|
mFrequency = frequency;
|
||||||
mChildrenAddress = childrenAddress;
|
mChildrenAddress = childrenAddress;
|
||||||
|
mShortcutTargets = shortcutTargets;
|
||||||
mBigrams = bigrams;
|
mBigrams = bigrams;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue