am 5526d138: Merge "[FD2] Separate cached address before/after update for nodes."
* commit '5526d138fe76e7d19a58ab68c9ba7a0354172daa': [FD2] Separate cached address before/after update for nodes.
main
commit
9b4d0c7047
|
@ -384,12 +384,13 @@ public final class BinaryDictInputOutput {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Compute the maximum size of a node, assuming 3-byte addresses for everything, and caches
|
* Compute the maximum size of a node, assuming 3-byte addresses for everything, and caches
|
||||||
* it in the 'actualSize' member of the node.
|
* it in the 'actualSize' member of the node, then returns it.
|
||||||
*
|
*
|
||||||
* @param node the node to compute the maximum size of.
|
* @param node the node to compute the maximum size of.
|
||||||
* @param options file format options.
|
* @param options file format options.
|
||||||
|
* @return the size of the node.
|
||||||
*/
|
*/
|
||||||
private static void setNodeMaximumSize(final Node node, final FormatOptions options) {
|
private static int calculateNodeMaximumSize(final Node node, final FormatOptions options) {
|
||||||
int size = getGroupCountSize(node);
|
int size = getGroupCountSize(node);
|
||||||
for (CharGroup g : node.mData) {
|
for (CharGroup g : node.mData) {
|
||||||
final int groupSize = getCharGroupMaximumSize(g, options);
|
final int groupSize = getCharGroupMaximumSize(g, options);
|
||||||
|
@ -400,6 +401,7 @@ public final class BinaryDictInputOutput {
|
||||||
size += FormatSpec.FORWARD_LINK_ADDRESS_SIZE;
|
size += FormatSpec.FORWARD_LINK_ADDRESS_SIZE;
|
||||||
}
|
}
|
||||||
node.mCachedSize = size;
|
node.mCachedSize = size;
|
||||||
|
return size;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -548,17 +550,17 @@ public final class BinaryDictInputOutput {
|
||||||
boolean changed = false;
|
boolean changed = false;
|
||||||
int size = getGroupCountSize(node);
|
int size = getGroupCountSize(node);
|
||||||
for (CharGroup group : node.mData) {
|
for (CharGroup group : node.mData) {
|
||||||
if (group.mCachedAddress != node.mCachedAddress + size) {
|
if (group.mCachedAddress != node.mCachedAddressBeforeUpdate + size) {
|
||||||
changed = true;
|
changed = true;
|
||||||
group.mCachedAddress = node.mCachedAddress + size;
|
group.mCachedAddress = node.mCachedAddressBeforeUpdate + size;
|
||||||
}
|
}
|
||||||
int groupSize = getGroupHeaderSize(group, formatOptions);
|
int groupSize = getGroupHeaderSize(group, formatOptions);
|
||||||
if (group.isTerminal()) groupSize += FormatSpec.GROUP_FREQUENCY_SIZE;
|
if (group.isTerminal()) groupSize += FormatSpec.GROUP_FREQUENCY_SIZE;
|
||||||
if (null == group.mChildren && formatOptions.mSupportsDynamicUpdate) {
|
if (null == group.mChildren && formatOptions.mSupportsDynamicUpdate) {
|
||||||
groupSize += FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE;
|
groupSize += FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE;
|
||||||
} else if (null != group.mChildren) {
|
} else if (null != group.mChildren) {
|
||||||
final int offsetBasePoint = groupSize + node.mCachedAddress + size;
|
final int offsetBasePoint = groupSize + node.mCachedAddressBeforeUpdate + size;
|
||||||
final int offset = group.mChildren.mCachedAddress - offsetBasePoint;
|
final int offset = group.mChildren.mCachedAddressBeforeUpdate - offsetBasePoint;
|
||||||
if (formatOptions.mSupportsDynamicUpdate) {
|
if (formatOptions.mSupportsDynamicUpdate) {
|
||||||
groupSize += FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE;
|
groupSize += FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE;
|
||||||
} else {
|
} else {
|
||||||
|
@ -568,7 +570,7 @@ public final class BinaryDictInputOutput {
|
||||||
groupSize += getShortcutListSize(group.mShortcutTargets);
|
groupSize += getShortcutListSize(group.mShortcutTargets);
|
||||||
if (null != group.mBigrams) {
|
if (null != group.mBigrams) {
|
||||||
for (WeightedString bigram : group.mBigrams) {
|
for (WeightedString bigram : group.mBigrams) {
|
||||||
final int offsetBasePoint = groupSize + node.mCachedAddress + size
|
final int offsetBasePoint = groupSize + node.mCachedAddressBeforeUpdate + size
|
||||||
+ FormatSpec.GROUP_FLAGS_SIZE;
|
+ FormatSpec.GROUP_FLAGS_SIZE;
|
||||||
final int addressOfBigram = findAddressOfWord(dict, bigram.mWord);
|
final int addressOfBigram = findAddressOfWord(dict, bigram.mWord);
|
||||||
final int offset = addressOfBigram - offsetBasePoint;
|
final int offset = addressOfBigram - offsetBasePoint;
|
||||||
|
@ -595,11 +597,13 @@ public final class BinaryDictInputOutput {
|
||||||
* @param formatOptions file format options.
|
* @param formatOptions file format options.
|
||||||
* @return the byte size of the entire stack.
|
* @return the byte size of the entire stack.
|
||||||
*/
|
*/
|
||||||
|
// TODO: rename this method when all it does is fill back the cached addresses before update
|
||||||
|
// with cached addresses after update.
|
||||||
private static int stackNodes(final ArrayList<Node> flatNodes,
|
private static int stackNodes(final ArrayList<Node> flatNodes,
|
||||||
final FormatOptions formatOptions) {
|
final FormatOptions formatOptions) {
|
||||||
int nodeOffset = 0;
|
int nodeOffset = 0;
|
||||||
for (Node n : flatNodes) {
|
for (final Node n : flatNodes) {
|
||||||
n.mCachedAddress = nodeOffset;
|
n.mCachedAddressBeforeUpdate = n.mCachedAddressAfterUpdate;
|
||||||
int groupCountSize = getGroupCountSize(n);
|
int groupCountSize = getGroupCountSize(n);
|
||||||
int groupOffset = 0;
|
int groupOffset = 0;
|
||||||
for (CharGroup g : n.mData) {
|
for (CharGroup g : n.mData) {
|
||||||
|
@ -612,6 +616,10 @@ public final class BinaryDictInputOutput {
|
||||||
if (nodeSize != n.mCachedSize) {
|
if (nodeSize != n.mCachedSize) {
|
||||||
throw new RuntimeException("Bug : Stored and computed node size differ");
|
throw new RuntimeException("Bug : Stored and computed node size differ");
|
||||||
}
|
}
|
||||||
|
if (nodeOffset != n.mCachedAddressAfterUpdate) {
|
||||||
|
// TODO: remove this test when the code is well tested
|
||||||
|
throw new RuntimeException("Bug : Stored and computed node address differ");
|
||||||
|
}
|
||||||
nodeOffset += n.mCachedSize;
|
nodeOffset += n.mCachedSize;
|
||||||
}
|
}
|
||||||
return nodeOffset;
|
return nodeOffset;
|
||||||
|
@ -627,11 +635,13 @@ public final class BinaryDictInputOutput {
|
||||||
*/
|
*/
|
||||||
private static void computeParentAddresses(final ArrayList<Node> flatNodes) {
|
private static void computeParentAddresses(final ArrayList<Node> flatNodes) {
|
||||||
for (final Node node : flatNodes) {
|
for (final Node node : flatNodes) {
|
||||||
for (CharGroup group : node.mData) {
|
for (final CharGroup group : node.mData) {
|
||||||
if (null != group.mChildren) {
|
if (null != group.mChildren) {
|
||||||
// assign my address to children's parent address
|
// Assign my address to children's parent address
|
||||||
|
// Here BeforeUpdate and AfterUpdate addresses have the same value, so it
|
||||||
|
// does not matter which we use.
|
||||||
group.mChildren.mCachedParentAddress = group.mCachedAddress
|
group.mChildren.mCachedParentAddress = group.mCachedAddress
|
||||||
- group.mChildren.mCachedAddress;
|
- group.mChildren.mCachedAddressAfterUpdate;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -654,9 +664,13 @@ public final class BinaryDictInputOutput {
|
||||||
*/
|
*/
|
||||||
private static ArrayList<Node> computeAddresses(final FusionDictionary dict,
|
private static ArrayList<Node> computeAddresses(final FusionDictionary dict,
|
||||||
final ArrayList<Node> flatNodes, final FormatOptions formatOptions) {
|
final ArrayList<Node> flatNodes, final FormatOptions formatOptions) {
|
||||||
// First get the worst sizes and offsets
|
// First get the worst possible sizes and offsets
|
||||||
for (Node n : flatNodes) setNodeMaximumSize(n, formatOptions);
|
int offset = 0;
|
||||||
final int offset = stackNodes(flatNodes, formatOptions);
|
for (final Node n : flatNodes) {
|
||||||
|
n.mCachedAddressAfterUpdate = offset;
|
||||||
|
offset += calculateNodeMaximumSize(n, formatOptions);
|
||||||
|
}
|
||||||
|
offset = stackNodes(flatNodes, formatOptions);
|
||||||
|
|
||||||
MakedictLog.i("Compressing the array addresses. Original size : " + offset);
|
MakedictLog.i("Compressing the array addresses. Original size : " + offset);
|
||||||
MakedictLog.i("(Recursively seen size : " + offset + ")");
|
MakedictLog.i("(Recursively seen size : " + offset + ")");
|
||||||
|
@ -665,11 +679,14 @@ public final class BinaryDictInputOutput {
|
||||||
boolean changesDone = false;
|
boolean changesDone = false;
|
||||||
do {
|
do {
|
||||||
changesDone = false;
|
changesDone = false;
|
||||||
for (Node n : flatNodes) {
|
int nodeStartOffset = 0;
|
||||||
|
for (final Node n : flatNodes) {
|
||||||
|
n.mCachedAddressAfterUpdate = nodeStartOffset;
|
||||||
final int oldNodeSize = n.mCachedSize;
|
final int oldNodeSize = n.mCachedSize;
|
||||||
final boolean changed = computeActualNodeSize(n, dict, formatOptions);
|
final boolean changed = computeActualNodeSize(n, dict, formatOptions);
|
||||||
final int newNodeSize = n.mCachedSize;
|
final int newNodeSize = n.mCachedSize;
|
||||||
if (oldNodeSize < newNodeSize) throw new RuntimeException("Increased size ?!");
|
if (oldNodeSize < newNodeSize) throw new RuntimeException("Increased size ?!");
|
||||||
|
nodeStartOffset += newNodeSize;
|
||||||
changesDone |= changed;
|
changesDone |= changed;
|
||||||
}
|
}
|
||||||
stackNodes(flatNodes, formatOptions);
|
stackNodes(flatNodes, formatOptions);
|
||||||
|
@ -683,7 +700,7 @@ public final class BinaryDictInputOutput {
|
||||||
final Node lastNode = flatNodes.get(flatNodes.size() - 1);
|
final Node lastNode = flatNodes.get(flatNodes.size() - 1);
|
||||||
MakedictLog.i("Compression complete in " + passes + " passes.");
|
MakedictLog.i("Compression complete in " + passes + " passes.");
|
||||||
MakedictLog.i("After address compression : "
|
MakedictLog.i("After address compression : "
|
||||||
+ (lastNode.mCachedAddress + lastNode.mCachedSize));
|
+ (lastNode.mCachedAddressAfterUpdate + lastNode.mCachedSize));
|
||||||
|
|
||||||
return flatNodes;
|
return flatNodes;
|
||||||
}
|
}
|
||||||
|
@ -701,10 +718,12 @@ public final class BinaryDictInputOutput {
|
||||||
private static void checkFlatNodeArray(final ArrayList<Node> array) {
|
private static void checkFlatNodeArray(final ArrayList<Node> array) {
|
||||||
int offset = 0;
|
int offset = 0;
|
||||||
int index = 0;
|
int index = 0;
|
||||||
for (Node n : array) {
|
for (final Node n : array) {
|
||||||
if (n.mCachedAddress != offset) {
|
// BeforeUpdate and AfterUpdate addresses are the same here, so it does not matter
|
||||||
|
// which we use.
|
||||||
|
if (n.mCachedAddressAfterUpdate != offset) {
|
||||||
throw new RuntimeException("Wrong address for node " + index
|
throw new RuntimeException("Wrong address for node " + index
|
||||||
+ " : expected " + offset + ", got " + n.mCachedAddress);
|
+ " : expected " + offset + ", got " + n.mCachedAddressAfterUpdate);
|
||||||
}
|
}
|
||||||
++index;
|
++index;
|
||||||
offset += n.mCachedSize;
|
offset += n.mCachedSize;
|
||||||
|
@ -946,7 +965,7 @@ public final class BinaryDictInputOutput {
|
||||||
private static int writePlacedNode(final FusionDictionary dict, byte[] buffer,
|
private static int writePlacedNode(final FusionDictionary dict, byte[] buffer,
|
||||||
final Node node, final FormatOptions formatOptions) {
|
final Node node, final FormatOptions formatOptions) {
|
||||||
// TODO: Make the code in common with BinaryDictIOUtils#writeCharGroup
|
// TODO: Make the code in common with BinaryDictIOUtils#writeCharGroup
|
||||||
int index = node.mCachedAddress;
|
int index = node.mCachedAddressAfterUpdate;
|
||||||
|
|
||||||
final int groupCount = node.mData.size();
|
final int groupCount = node.mData.size();
|
||||||
final int countSize = getGroupCountSize(node);
|
final int countSize = getGroupCountSize(node);
|
||||||
|
@ -977,7 +996,7 @@ public final class BinaryDictInputOutput {
|
||||||
if (group.mFrequency >= 0) groupAddress += FormatSpec.GROUP_FREQUENCY_SIZE;
|
if (group.mFrequency >= 0) groupAddress += FormatSpec.GROUP_FREQUENCY_SIZE;
|
||||||
final int childrenOffset = null == group.mChildren
|
final int childrenOffset = null == group.mChildren
|
||||||
? FormatSpec.NO_CHILDREN_ADDRESS
|
? FormatSpec.NO_CHILDREN_ADDRESS
|
||||||
: group.mChildren.mCachedAddress - groupAddress;
|
: group.mChildren.mCachedAddressAfterUpdate - groupAddress;
|
||||||
byte flags = makeCharGroupFlags(group, groupAddress, childrenOffset, formatOptions);
|
byte flags = makeCharGroupFlags(group, groupAddress, childrenOffset, formatOptions);
|
||||||
buffer[index++] = flags;
|
buffer[index++] = flags;
|
||||||
|
|
||||||
|
@ -985,7 +1004,7 @@ public final class BinaryDictInputOutput {
|
||||||
index = writeParentAddress(buffer, index, parentAddress, formatOptions);
|
index = writeParentAddress(buffer, index, parentAddress, formatOptions);
|
||||||
} else {
|
} else {
|
||||||
index = writeParentAddress(buffer, index,
|
index = writeParentAddress(buffer, index,
|
||||||
parentAddress + (node.mCachedAddress - group.mCachedAddress),
|
parentAddress + (node.mCachedAddressAfterUpdate - group.mCachedAddress),
|
||||||
formatOptions);
|
formatOptions);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1055,9 +1074,9 @@ public final class BinaryDictInputOutput {
|
||||||
= FormatSpec.NO_FORWARD_LINK_ADDRESS;
|
= FormatSpec.NO_FORWARD_LINK_ADDRESS;
|
||||||
index += FormatSpec.FORWARD_LINK_ADDRESS_SIZE;
|
index += FormatSpec.FORWARD_LINK_ADDRESS_SIZE;
|
||||||
}
|
}
|
||||||
if (index != node.mCachedAddress + node.mCachedSize) throw new RuntimeException(
|
if (index != node.mCachedAddressAfterUpdate + node.mCachedSize) throw new RuntimeException(
|
||||||
"Not the same size : written "
|
"Not the same size : written "
|
||||||
+ (index - node.mCachedAddress) + " bytes out of a node that should have "
|
+ (index - node.mCachedAddressAfterUpdate) + " bytes from a node that should have "
|
||||||
+ node.mCachedSize + " bytes");
|
+ node.mCachedSize + " bytes");
|
||||||
return index;
|
return index;
|
||||||
}
|
}
|
||||||
|
@ -1077,25 +1096,27 @@ public final class BinaryDictInputOutput {
|
||||||
int charGroups = 0;
|
int charGroups = 0;
|
||||||
int maxGroups = 0;
|
int maxGroups = 0;
|
||||||
int maxRuns = 0;
|
int maxRuns = 0;
|
||||||
for (Node n : nodes) {
|
for (final Node n : nodes) {
|
||||||
if (maxGroups < n.mData.size()) maxGroups = n.mData.size();
|
if (maxGroups < n.mData.size()) maxGroups = n.mData.size();
|
||||||
for (CharGroup cg : n.mData) {
|
for (final CharGroup cg : n.mData) {
|
||||||
++charGroups;
|
++charGroups;
|
||||||
if (cg.mChars.length > maxRuns) maxRuns = cg.mChars.length;
|
if (cg.mChars.length > maxRuns) maxRuns = cg.mChars.length;
|
||||||
if (cg.mFrequency >= 0) {
|
if (cg.mFrequency >= 0) {
|
||||||
if (n.mCachedAddress < firstTerminalAddress)
|
if (n.mCachedAddressAfterUpdate < firstTerminalAddress)
|
||||||
firstTerminalAddress = n.mCachedAddress;
|
firstTerminalAddress = n.mCachedAddressAfterUpdate;
|
||||||
if (n.mCachedAddress > lastTerminalAddress)
|
if (n.mCachedAddressAfterUpdate > lastTerminalAddress)
|
||||||
lastTerminalAddress = n.mCachedAddress;
|
lastTerminalAddress = n.mCachedAddressAfterUpdate;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (n.mCachedAddress + n.mCachedSize > size) size = n.mCachedAddress + n.mCachedSize;
|
if (n.mCachedAddressAfterUpdate + n.mCachedSize > size) {
|
||||||
|
size = n.mCachedAddressAfterUpdate + n.mCachedSize;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
final int[] groupCounts = new int[maxGroups + 1];
|
final int[] groupCounts = new int[maxGroups + 1];
|
||||||
final int[] runCounts = new int[maxRuns + 1];
|
final int[] runCounts = new int[maxRuns + 1];
|
||||||
for (Node n : nodes) {
|
for (final Node n : nodes) {
|
||||||
++groupCounts[n.mData.size()];
|
++groupCounts[n.mData.size()];
|
||||||
for (CharGroup cg : n.mData) {
|
for (final CharGroup cg : n.mData) {
|
||||||
++runCounts[cg.mChars.length];
|
++runCounts[cg.mChars.length];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1205,7 +1226,7 @@ public final class BinaryDictInputOutput {
|
||||||
|
|
||||||
// Create a buffer that matches the final dictionary size.
|
// Create a buffer that matches the final dictionary size.
|
||||||
final Node lastNode = flatNodes.get(flatNodes.size() - 1);
|
final Node lastNode = flatNodes.get(flatNodes.size() - 1);
|
||||||
final int bufferSize = lastNode.mCachedAddress + lastNode.mCachedSize;
|
final int bufferSize = lastNode.mCachedAddressAfterUpdate + lastNode.mCachedSize;
|
||||||
final byte[] buffer = new byte[bufferSize];
|
final byte[] buffer = new byte[bufferSize];
|
||||||
int index = 0;
|
int index = 0;
|
||||||
|
|
||||||
|
@ -1584,8 +1605,9 @@ public final class BinaryDictInputOutput {
|
||||||
buffer.position() != FormatSpec.NO_FORWARD_LINK_ADDRESS);
|
buffer.position() != FormatSpec.NO_FORWARD_LINK_ADDRESS);
|
||||||
|
|
||||||
final Node node = new Node(nodeContents);
|
final Node node = new Node(nodeContents);
|
||||||
node.mCachedAddress = nodeOrigin;
|
node.mCachedAddressBeforeUpdate = nodeOrigin;
|
||||||
reverseNodeMap.put(node.mCachedAddress, node);
|
node.mCachedAddressAfterUpdate = nodeOrigin;
|
||||||
|
reverseNodeMap.put(node.mCachedAddressAfterUpdate, node);
|
||||||
return node;
|
return node;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -46,7 +46,13 @@ public final class FusionDictionary implements Iterable<Word> {
|
||||||
ArrayList<CharGroup> mData;
|
ArrayList<CharGroup> mData;
|
||||||
// To help with binary generation
|
// To help with binary generation
|
||||||
int mCachedSize = Integer.MIN_VALUE;
|
int mCachedSize = Integer.MIN_VALUE;
|
||||||
int mCachedAddress = Integer.MIN_VALUE;
|
// mCachedAddressBefore/AfterUpdate are helpers for binary dictionary generation. They
|
||||||
|
// always hold the same value except between dictionary address compression, during which
|
||||||
|
// the update process needs to know about both values at the same time. Updating will
|
||||||
|
// update the AfterUpdate value, and the code will move them to BeforeUpdate before
|
||||||
|
// the next update pass.
|
||||||
|
int mCachedAddressBeforeUpdate = Integer.MIN_VALUE;
|
||||||
|
int mCachedAddressAfterUpdate = Integer.MIN_VALUE;
|
||||||
int mCachedParentAddress = 0;
|
int mCachedParentAddress = 0;
|
||||||
|
|
||||||
public Node() {
|
public Node() {
|
||||||
|
|
Loading…
Reference in New Issue