[FD2] Separate cached address before/after update for nodes.

Bug: 8526576
Change-Id: Ib9f8594a9e12dc75eba296faff2612c4bd7483d3
main
Jean Chalard 2013-07-10 19:23:03 +09:00
parent 62f3b4e9c9
commit 91cbe3566d
2 changed files with 66 additions and 38 deletions

View File

@ -384,12 +384,13 @@ public final class BinaryDictInputOutput {
/** /**
* Compute the maximum size of a node, assuming 3-byte addresses for everything, and caches * Compute the maximum size of a node, assuming 3-byte addresses for everything, and caches
* it in the 'actualSize' member of the node. * it in the 'actualSize' member of the node, then returns it.
* *
* @param node the node to compute the maximum size of. * @param node the node to compute the maximum size of.
* @param options file format options. * @param options file format options.
* @return the size of the node.
*/ */
private static void setNodeMaximumSize(final Node node, final FormatOptions options) { private static int calculateNodeMaximumSize(final Node node, final FormatOptions options) {
int size = getGroupCountSize(node); int size = getGroupCountSize(node);
for (CharGroup g : node.mData) { for (CharGroup g : node.mData) {
final int groupSize = getCharGroupMaximumSize(g, options); final int groupSize = getCharGroupMaximumSize(g, options);
@ -400,6 +401,7 @@ public final class BinaryDictInputOutput {
size += FormatSpec.FORWARD_LINK_ADDRESS_SIZE; size += FormatSpec.FORWARD_LINK_ADDRESS_SIZE;
} }
node.mCachedSize = size; node.mCachedSize = size;
return size;
} }
/** /**
@ -548,17 +550,17 @@ public final class BinaryDictInputOutput {
boolean changed = false; boolean changed = false;
int size = getGroupCountSize(node); int size = getGroupCountSize(node);
for (CharGroup group : node.mData) { for (CharGroup group : node.mData) {
if (group.mCachedAddress != node.mCachedAddress + size) { if (group.mCachedAddress != node.mCachedAddressBeforeUpdate + size) {
changed = true; changed = true;
group.mCachedAddress = node.mCachedAddress + size; group.mCachedAddress = node.mCachedAddressBeforeUpdate + size;
} }
int groupSize = getGroupHeaderSize(group, formatOptions); int groupSize = getGroupHeaderSize(group, formatOptions);
if (group.isTerminal()) groupSize += FormatSpec.GROUP_FREQUENCY_SIZE; if (group.isTerminal()) groupSize += FormatSpec.GROUP_FREQUENCY_SIZE;
if (null == group.mChildren && formatOptions.mSupportsDynamicUpdate) { if (null == group.mChildren && formatOptions.mSupportsDynamicUpdate) {
groupSize += FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE; groupSize += FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE;
} else if (null != group.mChildren) { } else if (null != group.mChildren) {
final int offsetBasePoint = groupSize + node.mCachedAddress + size; final int offsetBasePoint = groupSize + node.mCachedAddressBeforeUpdate + size;
final int offset = group.mChildren.mCachedAddress - offsetBasePoint; final int offset = group.mChildren.mCachedAddressBeforeUpdate - offsetBasePoint;
if (formatOptions.mSupportsDynamicUpdate) { if (formatOptions.mSupportsDynamicUpdate) {
groupSize += FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE; groupSize += FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE;
} else { } else {
@ -568,7 +570,7 @@ public final class BinaryDictInputOutput {
groupSize += getShortcutListSize(group.mShortcutTargets); groupSize += getShortcutListSize(group.mShortcutTargets);
if (null != group.mBigrams) { if (null != group.mBigrams) {
for (WeightedString bigram : group.mBigrams) { for (WeightedString bigram : group.mBigrams) {
final int offsetBasePoint = groupSize + node.mCachedAddress + size final int offsetBasePoint = groupSize + node.mCachedAddressBeforeUpdate + size
+ FormatSpec.GROUP_FLAGS_SIZE; + FormatSpec.GROUP_FLAGS_SIZE;
final int addressOfBigram = findAddressOfWord(dict, bigram.mWord); final int addressOfBigram = findAddressOfWord(dict, bigram.mWord);
final int offset = addressOfBigram - offsetBasePoint; final int offset = addressOfBigram - offsetBasePoint;
@ -595,11 +597,13 @@ public final class BinaryDictInputOutput {
* @param formatOptions file format options. * @param formatOptions file format options.
* @return the byte size of the entire stack. * @return the byte size of the entire stack.
*/ */
// TODO: rename this method when all it does is fill back the cached addresses before update
// with cached addresses after update.
private static int stackNodes(final ArrayList<Node> flatNodes, private static int stackNodes(final ArrayList<Node> flatNodes,
final FormatOptions formatOptions) { final FormatOptions formatOptions) {
int nodeOffset = 0; int nodeOffset = 0;
for (Node n : flatNodes) { for (final Node n : flatNodes) {
n.mCachedAddress = nodeOffset; n.mCachedAddressBeforeUpdate = n.mCachedAddressAfterUpdate;
int groupCountSize = getGroupCountSize(n); int groupCountSize = getGroupCountSize(n);
int groupOffset = 0; int groupOffset = 0;
for (CharGroup g : n.mData) { for (CharGroup g : n.mData) {
@ -612,6 +616,10 @@ public final class BinaryDictInputOutput {
if (nodeSize != n.mCachedSize) { if (nodeSize != n.mCachedSize) {
throw new RuntimeException("Bug : Stored and computed node size differ"); throw new RuntimeException("Bug : Stored and computed node size differ");
} }
if (nodeOffset != n.mCachedAddressAfterUpdate) {
// TODO: remove this test when the code is well tested
throw new RuntimeException("Bug : Stored and computed node address differ");
}
nodeOffset += n.mCachedSize; nodeOffset += n.mCachedSize;
} }
return nodeOffset; return nodeOffset;
@ -627,11 +635,13 @@ public final class BinaryDictInputOutput {
*/ */
private static void computeParentAddresses(final ArrayList<Node> flatNodes) { private static void computeParentAddresses(final ArrayList<Node> flatNodes) {
for (final Node node : flatNodes) { for (final Node node : flatNodes) {
for (CharGroup group : node.mData) { for (final CharGroup group : node.mData) {
if (null != group.mChildren) { if (null != group.mChildren) {
// assign my address to children's parent address // Assign my address to children's parent address
// Here BeforeUpdate and AfterUpdate addresses have the same value, so it
// does not matter which we use.
group.mChildren.mCachedParentAddress = group.mCachedAddress group.mChildren.mCachedParentAddress = group.mCachedAddress
- group.mChildren.mCachedAddress; - group.mChildren.mCachedAddressAfterUpdate;
} }
} }
} }
@ -654,9 +664,13 @@ public final class BinaryDictInputOutput {
*/ */
private static ArrayList<Node> computeAddresses(final FusionDictionary dict, private static ArrayList<Node> computeAddresses(final FusionDictionary dict,
final ArrayList<Node> flatNodes, final FormatOptions formatOptions) { final ArrayList<Node> flatNodes, final FormatOptions formatOptions) {
// First get the worst sizes and offsets // First get the worst possible sizes and offsets
for (Node n : flatNodes) setNodeMaximumSize(n, formatOptions); int offset = 0;
final int offset = stackNodes(flatNodes, formatOptions); for (final Node n : flatNodes) {
n.mCachedAddressAfterUpdate = offset;
offset += calculateNodeMaximumSize(n, formatOptions);
}
offset = stackNodes(flatNodes, formatOptions);
MakedictLog.i("Compressing the array addresses. Original size : " + offset); MakedictLog.i("Compressing the array addresses. Original size : " + offset);
MakedictLog.i("(Recursively seen size : " + offset + ")"); MakedictLog.i("(Recursively seen size : " + offset + ")");
@ -665,11 +679,14 @@ public final class BinaryDictInputOutput {
boolean changesDone = false; boolean changesDone = false;
do { do {
changesDone = false; changesDone = false;
for (Node n : flatNodes) { int nodeStartOffset = 0;
for (final Node n : flatNodes) {
n.mCachedAddressAfterUpdate = nodeStartOffset;
final int oldNodeSize = n.mCachedSize; final int oldNodeSize = n.mCachedSize;
final boolean changed = computeActualNodeSize(n, dict, formatOptions); final boolean changed = computeActualNodeSize(n, dict, formatOptions);
final int newNodeSize = n.mCachedSize; final int newNodeSize = n.mCachedSize;
if (oldNodeSize < newNodeSize) throw new RuntimeException("Increased size ?!"); if (oldNodeSize < newNodeSize) throw new RuntimeException("Increased size ?!");
nodeStartOffset += newNodeSize;
changesDone |= changed; changesDone |= changed;
} }
stackNodes(flatNodes, formatOptions); stackNodes(flatNodes, formatOptions);
@ -683,7 +700,7 @@ public final class BinaryDictInputOutput {
final Node lastNode = flatNodes.get(flatNodes.size() - 1); final Node lastNode = flatNodes.get(flatNodes.size() - 1);
MakedictLog.i("Compression complete in " + passes + " passes."); MakedictLog.i("Compression complete in " + passes + " passes.");
MakedictLog.i("After address compression : " MakedictLog.i("After address compression : "
+ (lastNode.mCachedAddress + lastNode.mCachedSize)); + (lastNode.mCachedAddressAfterUpdate + lastNode.mCachedSize));
return flatNodes; return flatNodes;
} }
@ -701,10 +718,12 @@ public final class BinaryDictInputOutput {
private static void checkFlatNodeArray(final ArrayList<Node> array) { private static void checkFlatNodeArray(final ArrayList<Node> array) {
int offset = 0; int offset = 0;
int index = 0; int index = 0;
for (Node n : array) { for (final Node n : array) {
if (n.mCachedAddress != offset) { // BeforeUpdate and AfterUpdate addresses are the same here, so it does not matter
// which we use.
if (n.mCachedAddressAfterUpdate != offset) {
throw new RuntimeException("Wrong address for node " + index throw new RuntimeException("Wrong address for node " + index
+ " : expected " + offset + ", got " + n.mCachedAddress); + " : expected " + offset + ", got " + n.mCachedAddressAfterUpdate);
} }
++index; ++index;
offset += n.mCachedSize; offset += n.mCachedSize;
@ -946,7 +965,7 @@ public final class BinaryDictInputOutput {
private static int writePlacedNode(final FusionDictionary dict, byte[] buffer, private static int writePlacedNode(final FusionDictionary dict, byte[] buffer,
final Node node, final FormatOptions formatOptions) { final Node node, final FormatOptions formatOptions) {
// TODO: Make the code in common with BinaryDictIOUtils#writeCharGroup // TODO: Make the code in common with BinaryDictIOUtils#writeCharGroup
int index = node.mCachedAddress; int index = node.mCachedAddressAfterUpdate;
final int groupCount = node.mData.size(); final int groupCount = node.mData.size();
final int countSize = getGroupCountSize(node); final int countSize = getGroupCountSize(node);
@ -977,7 +996,7 @@ public final class BinaryDictInputOutput {
if (group.mFrequency >= 0) groupAddress += FormatSpec.GROUP_FREQUENCY_SIZE; if (group.mFrequency >= 0) groupAddress += FormatSpec.GROUP_FREQUENCY_SIZE;
final int childrenOffset = null == group.mChildren final int childrenOffset = null == group.mChildren
? FormatSpec.NO_CHILDREN_ADDRESS ? FormatSpec.NO_CHILDREN_ADDRESS
: group.mChildren.mCachedAddress - groupAddress; : group.mChildren.mCachedAddressAfterUpdate - groupAddress;
byte flags = makeCharGroupFlags(group, groupAddress, childrenOffset, formatOptions); byte flags = makeCharGroupFlags(group, groupAddress, childrenOffset, formatOptions);
buffer[index++] = flags; buffer[index++] = flags;
@ -985,7 +1004,7 @@ public final class BinaryDictInputOutput {
index = writeParentAddress(buffer, index, parentAddress, formatOptions); index = writeParentAddress(buffer, index, parentAddress, formatOptions);
} else { } else {
index = writeParentAddress(buffer, index, index = writeParentAddress(buffer, index,
parentAddress + (node.mCachedAddress - group.mCachedAddress), parentAddress + (node.mCachedAddressAfterUpdate - group.mCachedAddress),
formatOptions); formatOptions);
} }
@ -1055,9 +1074,9 @@ public final class BinaryDictInputOutput {
= FormatSpec.NO_FORWARD_LINK_ADDRESS; = FormatSpec.NO_FORWARD_LINK_ADDRESS;
index += FormatSpec.FORWARD_LINK_ADDRESS_SIZE; index += FormatSpec.FORWARD_LINK_ADDRESS_SIZE;
} }
if (index != node.mCachedAddress + node.mCachedSize) throw new RuntimeException( if (index != node.mCachedAddressAfterUpdate + node.mCachedSize) throw new RuntimeException(
"Not the same size : written " "Not the same size : written "
+ (index - node.mCachedAddress) + " bytes out of a node that should have " + (index - node.mCachedAddressAfterUpdate) + " bytes from a node that should have "
+ node.mCachedSize + " bytes"); + node.mCachedSize + " bytes");
return index; return index;
} }
@ -1077,25 +1096,27 @@ public final class BinaryDictInputOutput {
int charGroups = 0; int charGroups = 0;
int maxGroups = 0; int maxGroups = 0;
int maxRuns = 0; int maxRuns = 0;
for (Node n : nodes) { for (final Node n : nodes) {
if (maxGroups < n.mData.size()) maxGroups = n.mData.size(); if (maxGroups < n.mData.size()) maxGroups = n.mData.size();
for (CharGroup cg : n.mData) { for (final CharGroup cg : n.mData) {
++charGroups; ++charGroups;
if (cg.mChars.length > maxRuns) maxRuns = cg.mChars.length; if (cg.mChars.length > maxRuns) maxRuns = cg.mChars.length;
if (cg.mFrequency >= 0) { if (cg.mFrequency >= 0) {
if (n.mCachedAddress < firstTerminalAddress) if (n.mCachedAddressAfterUpdate < firstTerminalAddress)
firstTerminalAddress = n.mCachedAddress; firstTerminalAddress = n.mCachedAddressAfterUpdate;
if (n.mCachedAddress > lastTerminalAddress) if (n.mCachedAddressAfterUpdate > lastTerminalAddress)
lastTerminalAddress = n.mCachedAddress; lastTerminalAddress = n.mCachedAddressAfterUpdate;
} }
} }
if (n.mCachedAddress + n.mCachedSize > size) size = n.mCachedAddress + n.mCachedSize; if (n.mCachedAddressAfterUpdate + n.mCachedSize > size) {
size = n.mCachedAddressAfterUpdate + n.mCachedSize;
}
} }
final int[] groupCounts = new int[maxGroups + 1]; final int[] groupCounts = new int[maxGroups + 1];
final int[] runCounts = new int[maxRuns + 1]; final int[] runCounts = new int[maxRuns + 1];
for (Node n : nodes) { for (final Node n : nodes) {
++groupCounts[n.mData.size()]; ++groupCounts[n.mData.size()];
for (CharGroup cg : n.mData) { for (final CharGroup cg : n.mData) {
++runCounts[cg.mChars.length]; ++runCounts[cg.mChars.length];
} }
} }
@ -1205,7 +1226,7 @@ public final class BinaryDictInputOutput {
// Create a buffer that matches the final dictionary size. // Create a buffer that matches the final dictionary size.
final Node lastNode = flatNodes.get(flatNodes.size() - 1); final Node lastNode = flatNodes.get(flatNodes.size() - 1);
final int bufferSize = lastNode.mCachedAddress + lastNode.mCachedSize; final int bufferSize = lastNode.mCachedAddressAfterUpdate + lastNode.mCachedSize;
final byte[] buffer = new byte[bufferSize]; final byte[] buffer = new byte[bufferSize];
int index = 0; int index = 0;
@ -1584,8 +1605,9 @@ public final class BinaryDictInputOutput {
buffer.position() != FormatSpec.NO_FORWARD_LINK_ADDRESS); buffer.position() != FormatSpec.NO_FORWARD_LINK_ADDRESS);
final Node node = new Node(nodeContents); final Node node = new Node(nodeContents);
node.mCachedAddress = nodeOrigin; node.mCachedAddressBeforeUpdate = nodeOrigin;
reverseNodeMap.put(node.mCachedAddress, node); node.mCachedAddressAfterUpdate = nodeOrigin;
reverseNodeMap.put(node.mCachedAddressAfterUpdate, node);
return node; return node;
} }

View File

@ -46,7 +46,13 @@ public final class FusionDictionary implements Iterable<Word> {
ArrayList<CharGroup> mData; ArrayList<CharGroup> mData;
// To help with binary generation // To help with binary generation
int mCachedSize = Integer.MIN_VALUE; int mCachedSize = Integer.MIN_VALUE;
int mCachedAddress = Integer.MIN_VALUE; // mCachedAddressBefore/AfterUpdate are helpers for binary dictionary generation. They
// always hold the same value except during dictionary address compression, when the
// update process needs to know about both values at the same time. Each update pass
// writes the AfterUpdate value, and the code copies it into BeforeUpdate before
// the next update pass.
int mCachedAddressBeforeUpdate = Integer.MIN_VALUE;
int mCachedAddressAfterUpdate = Integer.MIN_VALUE;
int mCachedParentAddress = 0; int mCachedParentAddress = 0;
public Node() { public Node() {